Skip to content

Commit

Permalink
Merge pull request #16 from JakeYallop/key-by-page-url
Browse files Browse the repository at this point in the history
Update page store key to include UrlKey
  • Loading branch information
JakeYallop authored May 24, 2024
2 parents 56085ad + 55dfc5b commit 3fe21b2
Showing 1 changed file with 26 additions and 7 deletions.
33 changes: 26 additions & 7 deletions WaybackDownloader/Services/PageWorker.cs
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
using System.Threading.Channels;
using System.Diagnostics.CodeAnalysis;
using System.Threading.Channels;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;
using Polly;
using Spectre.Console;

namespace WaybackDownloader.Services;

Expand All @@ -19,6 +19,24 @@ ILogger<PageWorker> logger
private static readonly Dictionary<string, Dictionary<string, int>> PathMap = [];
private static readonly SemaphoreSlim PathMapSemaphore = new(1, 1);

/// <summary>
/// Case-insensitive key identifying a page in the page store, built from a
/// record's URL key and its normalized output path.
/// </summary>
/// <remarks>
/// The two parts are joined with a single <c>-</c> and are not escaped, so the
/// composite <see cref="Value"/> is only as unique as that concatenation.
/// </remarks>
private readonly struct PageKey : IEquatable<PageKey>
{
    private PageKey(string key)
    {
        Value = key;
    }

    /// <summary>Creates a key of the form <c>{key}-{normalizedPath}</c>.</summary>
    /// <param name="key">First component of the composite key.</param>
    /// <param name="normalizedPath">Second component of the composite key.</param>
    public PageKey(string key, string normalizedPath) : this($"{key}-{normalizedPath}") { }

    /// <summary>
    /// The composite key string. This is <see langword="null"/> for
    /// <c>default(PageKey)</c>, because no constructor ran.
    /// </summary>
    public string Value { get; }

    /// <summary>Compares two keys using ordinal, case-insensitive string comparison.</summary>
    public bool Equals(PageKey other) => string.Equals(Value, other.Value, StringComparison.OrdinalIgnoreCase);

    /// <inheritdoc/>
    public override bool Equals([NotNullWhen(true)] object? obj) => obj is PageKey other && Equals(other);

    /// <summary>
    /// Returns a hash code consistent with <see cref="Equals(PageKey)"/>.
    /// </summary>
    // default(PageKey) has a null Value; return 0 instead of dereferencing it so
    // the default instance can still be hashed (Equals already tolerates null).
    public override int GetHashCode() => Value is null ? 0 : Value.GetHashCode(StringComparison.OrdinalIgnoreCase);

    /// <summary>Equality operator; same semantics as <see cref="Equals(PageKey)"/>.</summary>
    public static bool operator ==(PageKey left, PageKey right) => left.Equals(right);

    /// <summary>Inequality operator; negation of <see cref="Equals(PageKey)"/>.</summary>
    public static bool operator !=(PageKey left, PageKey right) => !left.Equals(right);

    /// <summary>Unwraps the key to its underlying string.</summary>
    public static explicit operator string(PageKey key) => key.Value;

    /// <summary>
    /// Wraps an already-composed key string as-is, without applying the
    /// <c>{key}-{normalizedPath}</c> format.
    /// </summary>
    public static PageKey UnsafeFromString(string key) => new(key);
}

private readonly ChannelReader<CdxRecord> _reader = channel.Reader;
public async Task StartAsync(string outputDir, CancellationToken cancellationToken)
{
Expand All @@ -39,10 +57,11 @@ public async Task StartAsync(string outputDir, CancellationToken cancellationTok
logger.UrlTransformed(record.Original, normalizedPath);

var writePath = Path.Combine(outputDir, normalizedPath);
var foundPage = pagesStore.TryGetDownloadedPageTimestamp(normalizedPath, out var timestamp);
var pageKey = new PageKey(record.UrlKey, normalizedPath);
var foundPage = pagesStore.TryGetDownloadedPageTimestamp(pageKey.Value, out var timestamp);
if (!foundPage || timestamp < record.Timestamp)
{
await TryWritePageAsync(record, writePath, normalizedPath, foundPage, CancellationToken.None).ConfigureAwait(false);
await TryWritePageAsync(record, writePath, normalizedPath, pageKey, foundPage, CancellationToken.None).ConfigureAwait(false);
}
else
{
Expand All @@ -55,7 +74,7 @@ public async Task StartAsync(string outputDir, CancellationToken cancellationTok
logger.ExitingWorker();
}

private async Task TryWritePageAsync(CdxRecord record, string writePath, string normalizedPath, bool isUpdateToExistingPage, CancellationToken cancellationToken)
private async Task TryWritePageAsync(CdxRecord record, string writePath, string normalizedPath, PageKey pageKey, bool isUpdateToExistingPage, CancellationToken cancellationToken)
{
//max path length
if (writePath.Length > 260 - Path.GetExtension(writePath).Length - 7)
Expand All @@ -76,7 +95,7 @@ private async Task TryWritePageAsync(CdxRecord record, string writePath, string

using var fs = new FileStream(writePath, FileMode.OpenOrCreate, FileAccess.Write, FileShare.None);
//now we have a file lock, check if our timestamp is greater than any recently newly added timestamps.
if (pagesStore.TryGetDownloadedPageTimestamp(normalizedPath, out var updatedTimestamp) && updatedTimestamp >= record.Timestamp)
if (pagesStore.TryGetDownloadedPageTimestamp(pageKey.Value, out var updatedTimestamp) && updatedTimestamp >= record.Timestamp)
{
return;
}
Expand Down Expand Up @@ -112,7 +131,7 @@ private async Task TryWritePageAsync(CdxRecord record, string writePath, string
logger.WrittenPage(writePath, record.Original, record.Timestamp);
IncrementCounter(isUpdateToExistingPage);
}
pagesStore.AddPage(normalizedPath, record.Timestamp);
pagesStore.AddPage(pageKey.Value, record.Timestamp);

static void IncrementCounter(bool isUpdateToExistingPage)
{
Expand Down

0 comments on commit 3fe21b2

Please sign in to comment.