Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PDF Metadata Support #3552

Merged
merged 11 commits into from
Feb 16, 2025
43 changes: 43 additions & 0 deletions API.Tests/Services/BookServiceTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -81,4 +81,47 @@ public void ShouldParseAsVolumeGroup_WithSeriesIndex()
Assert.Equal("Accel World", comicInfo.Series);
}

[Fact]
public void ShouldHaveComicInfoForPdf()
{
var testDirectory = Path.Join(Directory.GetCurrentDirectory(), "../../../Services/Test Data/BookService");
var document = Path.Join(testDirectory, "test.pdf");
var comicInfo = _bookService.GetComicInfo(document);
Assert.NotNull(comicInfo);
Assert.Equal("Variations Chromatiques de concert", comicInfo.Title);
Assert.Equal("Georges Bizet \\(1838-1875\\)", comicInfo.Writer);
}

// TODO: Get the file from microtherion
// [Fact]
// public void ShouldUsePdfInfoDict()
// {
// var testDirectory = Path.Join(Directory.GetCurrentDirectory(), "../../../Services/Test Data/ScannerService/Library/Books/PDFs");
// var document = Path.Join(testDirectory, "Rollo at Work SP01.pdf");
// var comicInfo = _bookService.GetComicInfo(document);
// Assert.NotNull(comicInfo);
// Assert.Equal("Rollo at Work", comicInfo.Title);
// Assert.Equal("Jacob Abbott", comicInfo.Writer);
// Assert.Equal(2008, comicInfo.Year);
// }

[Fact]
public void ShouldHandleIndirectPdfObjects()
{
var testDirectory = Path.Join(Directory.GetCurrentDirectory(), "../../../Services/Test Data/BookService");
var document = Path.Join(testDirectory, "indirect.pdf");
var comicInfo = _bookService.GetComicInfo(document);
Assert.NotNull(comicInfo);
Assert.Equal(2018, comicInfo.Year);
Assert.Equal(8, comicInfo.Month);
}

[Fact]
public void FailGracefullyWithEncryptedPdf()
{
var testDirectory = Path.Join(Directory.GetCurrentDirectory(), "../../../Services/Test Data/BookService");
var document = Path.Join(testDirectory, "encrypted.pdf");
var comicInfo = _bookService.GetComicInfo(document);
Assert.Null(comicInfo);
}
}
17 changes: 16 additions & 1 deletion API.Tests/Services/ScannerServiceTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,22 @@ public async Task ScanLibrary_FlatSeries()
[Fact]
public async Task ScanLibrary_FlatSeriesWithSpecialFolder()
{
var testcase = "Flat Series with Specials Folder - Manga.json";
var testcase = "Flat Series with Specials Folder Alt Naming - Manga.json";
var library = await _scannerHelper.GenerateScannerData(testcase);
var scanner = _scannerHelper.CreateServices();
await scanner.ScanLibrary(library.Id);
var postLib = await _unitOfWork.LibraryRepository.GetLibraryForIdAsync(library.Id, LibraryIncludes.Series);

Assert.NotNull(postLib);
Assert.Single(postLib.Series);
Assert.Equal(4, postLib.Series.First().Volumes.Count);
Assert.NotNull(postLib.Series.First().Volumes.FirstOrDefault(v => v.Chapters.FirstOrDefault(c => c.IsSpecial) != null));
}

[Fact]
public async Task ScanLibrary_FlatSeriesWithSpecialFolder_AlternativeNaming()
{
var testcase = "Flat Series with Specials Folder Alt Naming - Manga.json";
var library = await _scannerHelper.GenerateScannerData(testcase);
var scanner = _scannerHelper.CreateServices();
await scanner.ScanLibrary(library.Id);
Expand Down
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
[
"My Dress-Up Darling/My Dress-Up Darling v01.cbz",
"My Dress-Up Darling/My Dress-Up Darling v02.cbz",
"My Dress-Up Darling/My Dress-Up Darling ch 10.cbz",
"My Dress-Up Darling/Specials/My Dress-Up Darling - Omakes SP01.cbz"
]
159 changes: 159 additions & 0 deletions API/Helpers/PdfComicInfoExtractor.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
/// Translate PDF metadata (See PdfMetadataExtractor.cs) into ComicInfo structure.

// Contributed by https://github.com/microtherion

// All references to the "PDF Spec" (section numbers, etc) refer to the
// PDF 1.7 Specification a.k.a. PDF32000-1:2008
// https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf

using System;
using System.Xml;
using System.Text;
using System.IO;
using System.Diagnostics;
using API.Data.Metadata;
using API.Entities.Enums;
using API.Services;
using API.Services.Tasks.Scanner.Parser;
using Microsoft.Extensions.Logging;
using Nager.ArticleNumber;
using System.Collections.Generic;

namespace API.Helpers;
#nullable enable

public interface IPdfComicInfoExtractor
{
ComicInfo? GetComicInfo(string filePath);
}

public class PdfComicInfoExtractor : IPdfComicInfoExtractor
{
private readonly ILogger<BookService> _logger;
private readonly IMediaErrorService _mediaErrorService;
private readonly string[] _pdfDateFormats = [ // PDF Spec 7.9.4
"D:yyyyMMddHHmmsszzz:", "D:yyyyMMddHHmmss+", "D:yyyyMMddHHmmss",
"D:yyyyMMddHHmmzzz:", "D:yyyyMMddHHmm+", "D:yyyyMMddHHmm",
"D:yyyyMMddHHzzz:", "D:yyyyMMddHH+", "D:yyyyMMddHH",
"D:yyyyMMdd", "D:yyyyMM", "D:yyyy"
];

public PdfComicInfoExtractor(ILogger<BookService> logger, IMediaErrorService mediaErrorService)
{
_logger = logger;
_mediaErrorService = mediaErrorService;
}

private float? GetFloatFromText(string? text)
{
if (string.IsNullOrEmpty(text)) return null;

if (float.TryParse(text, out var value)) return value;

return null;
}

private DateTime? GetDateTimeFromText(string? text)
{
if (string.IsNullOrEmpty(text)) return null;

// Dates stored in the XMP metadata stream (PDF Spec 14.3.2)
// are stored in ISO 8601 format, which is handled by C# out of the box
if (DateTime.TryParse(text, out var date)) return date;

// Dates stored in the document information directory (PDF Spec 14.3.3)
// are stored in a proprietary format (PDF Spec 7.9.4) that needs to be
// massaged slightly to be expressible by a DateTime format.
if (text[0] != 'D') {
text = "D:" + text;
}
text = text.Replace("'", ":");
text = text.Replace("Z", "+");

foreach(var format in _pdfDateFormats)
{
if (DateTime.TryParseExact(text, format, null, System.Globalization.DateTimeStyles.None, out var pdfDate)) return pdfDate;
}

return null;
}

private string? MaybeGetMetadata(Dictionary<string, string> metadata, string key)
{
return metadata.ContainsKey(key) ? metadata[key] : null;
}

private ComicInfo? GetComicInfoFromMetadata(Dictionary<string, string> metadata, string filePath)
{
var info = new ComicInfo();

var publicationDate = GetDateTimeFromText(MaybeGetMetadata(metadata, "CreationDate"));

if (publicationDate != null)
{
info.Year = publicationDate.Value.Year;
info.Month = publicationDate.Value.Month;
info.Day = publicationDate.Value.Day;
}

info.Summary = MaybeGetMetadata(metadata, "Summary") ?? string.Empty;
info.Publisher = MaybeGetMetadata(metadata, "Publisher") ?? string.Empty;
info.Writer = MaybeGetMetadata(metadata, "Author") ?? string.Empty;
info.Title = MaybeGetMetadata(metadata, "Title") ?? string.Empty;
info.Genre = MaybeGetMetadata(metadata, "Subject") ?? string.Empty;
info.LanguageISO = BookService.ValidateLanguage(MaybeGetMetadata(metadata, "Language"));
info.Isbn = MaybeGetMetadata(metadata, "ISBN") ?? string.Empty;

if (info.Isbn != string.Empty && !ArticleNumberHelper.IsValidIsbn10(info.Isbn) && !ArticleNumberHelper.IsValidIsbn13(info.Isbn))
{
_logger.LogDebug("[BookService] {File} has an invalid ISBN number", filePath);
info.Isbn = string.Empty;
}

info.UserRating = GetFloatFromText(MaybeGetMetadata(metadata, "UserRating")) ?? 0.0f;
info.TitleSort = MaybeGetMetadata(metadata, "TitleSort") ?? string.Empty;
info.Series = MaybeGetMetadata(metadata, "Series") ?? info.TitleSort;
info.SeriesSort = info.Series;
info.Volume = (GetFloatFromText(MaybeGetMetadata(metadata, "Volume")) ?? 0.0f).ToString();

// If this is a single book and not a collection, set publication status to Completed
if (string.IsNullOrEmpty(info.Volume) && Parser.ParseVolume(filePath, LibraryType.Manga).Equals(Parser.LooseLeafVolume))
{
info.Count = 1;
}

// Removed as probably unneeded per discussion in https://github.com/Kareadita/Kavita/pull/3108#discussion_r1956747782
//
// var hasVolumeInSeries = !Parser.ParseVolume(info.Title, LibraryType.Manga)
// .Equals(Parser.LooseLeafVolume);

// if (string.IsNullOrEmpty(info.Volume) && hasVolumeInSeries && (!info.Series.Equals(info.Title) || string.IsNullOrEmpty(info.Series)))
// {
// // This is likely a light novel for which we can set series from parsed title
// info.Series = Parser.ParseSeries(info.Title, LibraryType.Manga);
// info.Volume = Parser.ParseVolume(info.Title, LibraryType.Manga);
// }

ComicInfo.CleanComicInfo(info);

return info;
}

public ComicInfo? GetComicInfo(string filePath)
{
try
{
var extractor = new PdfMetadataExtractor(_logger, filePath);

return GetComicInfoFromMetadata(extractor.GetMetadata(), filePath);
}
catch (Exception ex)
{
_logger.LogWarning(ex, "[GetComicInfo] There was an exception parsing PDF metadata for {File}", filePath);
_mediaErrorService.ReportMediaIssue(filePath, MediaErrorProducer.BookService,
"There was an exception parsing PDF metadata", ex);
}

return null;
}
}
Loading
Loading