From 27eb22fbd20eba07610f6e86d7383e45687d3fcd Mon Sep 17 00:00:00 2001 From: pmespresso Date: Fri, 27 Sep 2024 13:27:42 +0700 Subject: [PATCH 1/2] fix: Improve caching mechanism for URL indexing status - Modify shouldRecheck function to always process non-indexed URLs - Ensure "Submitted and Indexed" URLs are only rechecked after cache timeout - Optimize batch processing to avoid unnecessary API calls --- package-lock.json | 4 ++-- src/index.ts | 15 ++++++++++++--- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/package-lock.json b/package-lock.json index 005c376..caf5217 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "google-indexing-script", - "version": "0.3.0", + "version": "0.4.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "google-indexing-script", - "version": "0.3.0", + "version": "0.4.0", "license": "MIT", "dependencies": { "commander": "^12.1.0", diff --git a/src/index.ts b/src/index.ts index e764d97..9ddbc61 100644 --- a/src/index.ts +++ b/src/index.ts @@ -120,11 +120,20 @@ export const index = async (input: string = process.argv[2], options: IndexOptio ]; const shouldRecheck = (status: Status, lastCheckedAt: string) => { - const shouldIndexIt = indexableStatuses.includes(status); + if (status !== Status.SubmittedAndIndexed) { + return true; + } const isOld = new Date(lastCheckedAt) < new Date(Date.now() - CACHE_TIMEOUT); - return shouldIndexIt && isOld; + return isOld; }; + const urlsToProcess = pages.filter((url) => { + const result = statusPerUrl[url]; + return !result || shouldRecheck(result.status, result.lastCheckedAt); + }); + + console.log(`👉 Found ${urlsToProcess.length} URLs that need processing out of ${pages.length} total URLs`); + await batch( async (url) => { let result = statusPerUrl[url]; @@ -136,7 +145,7 @@ export const index = async (input: string = process.argv[2], options: IndexOptio pagesPerStatus[result.status] = pagesPerStatus[result.status] ? [...pagesPerStatus[result.status], url] : [url]; }, - pages, + urlsToProcess, 50, (batchIndex, batchCount) => { console.log(`📦 Batch ${batchIndex + 1} of ${batchCount} complete`); From 1bcb7c40c1c1cd00a30263c9c88095584927f88f Mon Sep 17 00:00:00 2001 From: pmespresso Date: Fri, 27 Sep 2024 14:44:09 +0700 Subject: [PATCH 2/2] - Introduce short-term cache for quickly fixable statuses (e.g., NotFound) - Maintain efficient long-term caching for stable, indexed URLs - add NotFound to Status enum --- src/index.ts | 46 ++++++++++++++++++++++++++++++++++++--------- src/shared/types.ts | 1 + 2 files changed, 38 insertions(+), 9 deletions(-) diff --git a/src/index.ts b/src/index.ts index 9ddbc61..a51a43d 100644 --- a/src/index.ts +++ b/src/index.ts @@ -16,6 +16,39 @@ import { readFileSync, existsSync, mkdirSync, writeFileSync } from "fs"; import path from "path"; const CACHE_TIMEOUT = 1000 * 60 * 60 * 24 * 14; // 14 days +const SHORT_TIMEOUT = 1000 * 60 * 60; // 1 hour + +const indexableStatuses = [ + Status.SubmittedAndIndexed, + Status.CrawledCurrentlyNotIndexed, + Status.DiscoveredCurrentlyNotIndexed, + Status.Forbidden, + Status.Error, + Status.RateLimited, +]; + +const quickFixStatuses = [Status.NotFound, Status.PageWithRedirect]; + +const shouldRecheck = (status: Status, lastCheckedAt: string) => { + const timeSinceLastCheck = Date.now() - new Date(lastCheckedAt).getTime(); + + if (indexableStatuses.includes(status)) { + if (status === Status.SubmittedAndIndexed) { + return timeSinceLastCheck > CACHE_TIMEOUT; + } + // For other indexable statuses, check more frequently + return timeSinceLastCheck > SHORT_TIMEOUT; + } + + if (quickFixStatuses.includes(status)) { + // For statuses that might be quickly fixed, use a shorter timeout + return timeSinceLastCheck > SHORT_TIMEOUT; + } + + // For any other status, use the standard cache timeout + return timeSinceLastCheck > CACHE_TIMEOUT; +}; + export const QUOTA = { rpm: { retries: 3, @@ -108,25 +141,20 @@ export const index = async (input: string = process.argv[2], options: IndexOptio [Status.RateLimited]: [], [Status.Forbidden]: [], [Status.Error]: [], + [Status.NotFound]: [], }; const indexableStatuses = [ - Status.DiscoveredCurrentlyNotIndexed, + Status.SubmittedAndIndexed, Status.CrawledCurrentlyNotIndexed, + Status.DiscoveredCurrentlyNotIndexed, Status.URLIsUnknownToGoogle, Status.Forbidden, Status.Error, Status.RateLimited, + Status.NotFound, ]; - const shouldRecheck = (status: Status, lastCheckedAt: string) => { - if (status !== Status.SubmittedAndIndexed) { - return true; - } - const isOld = new Date(lastCheckedAt) < new Date(Date.now() - CACHE_TIMEOUT); - return isOld; - }; - const urlsToProcess = pages.filter((url) => { const result = statusPerUrl[url]; return !result || shouldRecheck(result.status, result.lastCheckedAt); diff --git a/src/shared/types.ts b/src/shared/types.ts index db66fb6..5572c2c 100644 --- a/src/shared/types.ts +++ b/src/shared/types.ts @@ -11,4 +11,5 @@ export enum Status { RateLimited = "RateLimited", Forbidden = "Forbidden", Error = "Error", + NotFound = "Not found (404)", }