Skip to content

Commit

Permalink
feat: scraping script: add progress bar
Browse files: browse the repository at this point in the history
  • Loading branch information
sititou70 committed Dec 3, 2023
1 parent 6866767 commit b263c9a
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 4 deletions.
31 changes: 31 additions & 0 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions package.json
Original file line number | Diff line number | Diff line change
Expand Up @@ -12,6 +12,7 @@
"@mui/material": "5.10.12",
"@react-hook/window-size": "3.1.1",
"@types/d3": "7.4.0",
"@types/progress": "^2.0.7",
"@types/react": "18.0.25",
"@types/stats-lite": "2.2.0",
"awesome-sigmoid": "1.0.2",
Expand All @@ -23,6 +24,7 @@
"normalize.css": "8.0.1",
"npm-run-all": "4.1.5",
"prettier-plugin-organize-imports": "3.1.1",
"progress": "^2.0.3",
"react": "18.2.0",
"react-dom": "18.2.0",
"react-ga": "3.3.1",
Expand Down
28 changes: 24 additions & 4 deletions scraping/index.ts
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,11 @@
import fs from 'fs';
import fetch from 'node-fetch';
import path from 'path';
import Progress from 'progress';
import sanitize from 'sanitize-filename';

// settings
const FETCH_INTERVAL = 7000;
const CACHE_DIR = 'TEMP_CACHE';
const INPUT_COUPLINGS_JSON = path.join(
'..',
Expand Down Expand Up @@ -64,12 +66,30 @@ const main = async () => {
JSON.parse(fs.readFileSync(INPUT_COUPLINGS_JSON).toString())
).filter((x) => !isSelfCoupling(x));

const tags_len = couplings.flatMap((coupling) => coupling.tags).length;
const bar = new Progress('[:bar]\t:percent\t:rest_min min\t:tag_name\n', {
complete: '=',
incomplete: ' ',
width: 50,
total: tags_len,
});

let dest_couplings: Couplings = [];
- for (const [i, coupling] of couplings.entries()) {
+ for (const coupling of couplings) {
let tags: Coupling['tags'] = [];
- for (let tag of coupling.tags) {
- console.log(`[${i} / ${couplings.length}] scraping tag:`, tag.name);
+ for (const tag of coupling.tags) {
tags.push({ name: tag.name, num: await getNumsFromTag(tag.name) });

bar.tick(1, {
rest_min: (
Math.round(
(((tags_len - bar.curr) * FETCH_INTERVAL) / 1000 / 60) * 100
) / 100
)
.toString()
.padEnd(5, ' '),
tag_name: tag.name,
});
}

dest_couplings = [
Expand Down Expand Up @@ -134,7 +154,7 @@ const getNumsFromTag = async (tag: string): Promise<number> => {
return parseInt(fs.readFileSync(cache_file_path).toString());
} catch (e) {}

- await sleep(7000);
+ await sleep(FETCH_INTERVAL);

const url: string = `https://www.pixiv.net/tags/${encodeURIComponent(tag)}/`;
const row = await fetch(url);
Expand Down

0 comments on commit b263c9a

Please sign in to comment.