-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 4f4a970
Showing
6 changed files
with
1,216 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
|
||
# Created by https://www.gitignore.io/api/node | ||
|
||
### Node ### | ||
# Logs | ||
logs | ||
*.log | ||
npm-debug.log* | ||
yarn-debug.log* | ||
yarn-error.log* | ||
|
||
# Runtime data | ||
pids | ||
*.pid | ||
*.seed | ||
*.pid.lock | ||
|
||
# Directory for instrumented libs generated by jscoverage/JSCover | ||
lib-cov | ||
|
||
# Coverage directory used by tools like istanbul | ||
coverage | ||
|
||
# nyc test coverage | ||
.nyc_output | ||
|
||
# Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) | ||
.grunt | ||
|
||
# Bower dependency directory (https://bower.io/) | ||
bower_components | ||
|
||
# node-waf configuration | ||
.lock-wscript | ||
|
||
# Compiled binary addons (http://nodejs.org/api/addons.html) | ||
build/Release | ||
|
||
# Dependency directories | ||
node_modules/ | ||
jspm_packages/ | ||
|
||
# Typescript v1 declaration files | ||
typings/ | ||
|
||
# Optional npm cache directory | ||
.npm | ||
|
||
# Optional eslint cache | ||
.eslintcache | ||
|
||
# Optional REPL history | ||
.node_repl_history | ||
|
||
# Output of 'npm pack' | ||
*.tgz | ||
|
||
# Yarn Integrity file | ||
.yarn-integrity | ||
|
||
# dotenv environment variables file | ||
.env | ||
|
||
|
||
# End of https://www.gitignore.io/api/node |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
#!/usr/bin/env node | ||
require('dotenv').config(); | ||
// Command-line interface definition.
//   --url / -u         entry URL for a fresh crawl
//   --resume / -r      continue from the existing queue (exclusive with --url)
//   --max-radius / -m  maximum radius from the entry URL (default: unlimited)
const argv = require('yargs')
  .usage('Usage: $0 [options]')
  .option('url', {
    alias: 'u',
    describe: 'The URL the crawler should enter the site from',
  })
  .option('resume', {
    alias: 'r',
    boolean: true,
    describe: 'Resume crawler from existing queue',
  })
  .option('max-radius', {
    alias: 'm',
    number: true,
    describe: 'The maximum radius from the entry URL to crawl',
    default: Infinity,
  })
  .check(({ url, resume, maxRadius }) => {
    if (url && resume) {
      throw new Error('--url and --resume are mutually exclusive');
    }
    // A non-numeric value is parsed to NaN, and `radius >= NaN` is never
    // true, which would silently turn the limit into "unlimited".
    if (Number.isNaN(maxRadius)) {
      throw new Error('--max-radius must be a number');
    }
    return true;
  })
  .help().argv;
|
||
const db = require('./db'); | ||
const crawl = require('./crawler'); | ||
|
||
// Entry point: optionally reset the stored crawl state, run the crawler, and
// always release the database connection afterwards.
(async () => {
  try {
    // A run without --resume starts from scratch.
    if (!argv.resume) {
      await db.flush();
    }
    await crawl(argv.url, { maxRadius: argv.maxRadius });
  } catch (err) {
    // Report the failure instead of leaving an unhandled promise rejection.
    console.error(err);
    process.exitCode = 1;
  } finally {
    // Close the connection on both success and failure; an open client would
    // otherwise keep the process from exiting.
    db.close();
  }
})();
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
const _ = require('lodash'); | ||
const db = require('./db'); | ||
const puppeteer = require('puppeteer'); | ||
const url = require('url'); | ||
|
||
const debug = { | ||
crawl: require('debug')('crawler:crawl'), | ||
page: require('debug')('crawler:page'), | ||
}; | ||
|
||
/**
 * Crawl pages starting from the next queued target, or from `entry` when the
 * queue is empty, and feed every same-host link found back into the queue.
 *
 * Targets are obtained from `db.getCrawlUrl()` until it returns a falsy value.
 * Targets whose radius is at or beyond `maxRadius` are dequeued but not
 * visited. Links are kept only when their host (as reported by `url.parse`)
 * equals the host of the first target.
 *
 * @param {string} [entry] URL to start from when nothing is queued.
 * @param {Object} [options]
 * @param {number} [options.maxRadius=Infinity] Targets with
 *   `radius >= maxRadius` are skipped.
 * @returns {Promise<void>} Resolves when the queue is exhausted. Rejects with
 *   whatever navigation, scraping or database error occurred; the browser is
 *   closed before the rejection propagates.
 */
const crawl = async (entry, options = {}) => {
  debug.crawl('Crawler started');
  let target = (await db.getCrawlUrl()) || { url: entry, radius: 0 };
  const { maxRadius = Infinity } = options;
  if (!target.url) {
    debug.crawl('Nothing to crawl');
    return;
  }

  const entryUrl = url.parse(target.url);
  const browser = await puppeteer.launch();
  let count = 0;

  try {
    const page = await browser.newPage();
    debug.crawl('Puppeteer started');

    while (target) {
      if (target.radius >= maxRadius) {
        debug.page(`Max radius reached ${target.url} not scraped`);
      } else {
        count++;
        debug.page(`Crawling: ${target.url}`);
        await page.goto(target.url);
        debug.page(`Page loaded`);
        const links = await page.evaluate(() =>
          Array.from(document.querySelectorAll('a')).map(link => link.href)
        );
        // Skip non-string hrefs (e.g. SVG <a> elements expose `href` as an
        // object); url.parse would throw on them.
        const urls = links.filter(
          link =>
            typeof link === 'string' && url.parse(link).host === entryUrl.host
        );
        debug.page(`Scraped ${urls.length} urls`);
        // Links found on this page are one step further out than the page
        // itself; compute that without mutating the dequeued target.
        await db.addCrawlUrls(urls, target.radius + 1);
      }
      target = await db.getCrawlUrl();
    }
    debug.crawl(`Crawler finished after crawling ${count} pages`);
  } finally {
    // Shut the browser down even when a page fails, so no browser process is
    // left behind.
    await browser.close();
  }
};
|
||
module.exports = crawl; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
const bluebird = require('bluebird'); | ||
const redis = require('redis'); | ||
|
||
bluebird.promisifyAll(redis.RedisClient.prototype); | ||
bluebird.promisifyAll(redis.Multi.prototype); | ||
|
||
const debug = { | ||
redis: require('debug')('crawler:redis'), | ||
}; | ||
|
||
const client = redis.createClient( | ||
process.env.REDIS_PORT || 6379, | ||
process.env.REDIS_HOST || 'localhost' | ||
); | ||
|
||
module.exports = { | ||
addCrawlUrls: async (urls, radius) => { | ||
debug.redis('Add scraped urls to redis'); | ||
const multi = client.multi(); | ||
urls.forEach(url => { | ||
multi.sadd('discoveredPages', url); | ||
}); | ||
const result = await multi.execAsync(); | ||
debug.redis('Added urls to discovered set'); | ||
|
||
let count = 0; | ||
result.forEach((notDiscovered, i) => { | ||
if (notDiscovered) { | ||
count++; | ||
const url = urls[i]; | ||
multi.rpush('pageQueue', `${url} ${radius}`); | ||
} | ||
}); | ||
await multi.execAsync(); | ||
debug.redis(`Added ${count} new urls to queue`); | ||
debug.redis(`${urls.length - count} duplicates found`); | ||
}, | ||
|
||
addCrawlUrl: async (url, radius) => { | ||
const notDiscovered = await client.saddAsync('discoveredPages', url); | ||
if (!notDiscovered) { | ||
await client.rpushAsync('pageQueue', `${url} ${radius}`); | ||
} | ||
}, | ||
|
||
getCrawlUrl: async () => { | ||
debug.redis('Pop url from queue'); | ||
const reply = await client.lpopAsync('pageQueue'); | ||
if (reply) { | ||
debug.redis('Url popped'); | ||
if (debug.redis.enabled) { | ||
const length = await client.llenAsync('pageQueue'); | ||
debug.redis(`${length} urls in queue`); | ||
} | ||
const parts = reply.match(/(.+) ([0-9]+)$/); | ||
return { | ||
url: parts[1], | ||
radius: parseInt(parts[2]), | ||
}; | ||
} | ||
debug.redis('Queue empty'); | ||
return null; | ||
}, | ||
|
||
flush: async () => { | ||
debug.redis('Flush db'); | ||
await client.del('discoveredPages', 'pageQueue'); | ||
debug.redis('Redis flushed'); | ||
}, | ||
|
||
close: () => { | ||
client.end(true); | ||
}, | ||
}; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
{ | ||
"name": "crawler", | ||
"version": "1.0.0", | ||
"main": "crawl", | ||
"license": "MIT", | ||
"author": { | ||
"email": "[email protected]", | ||
"name": "Reed Dadoune", | ||
"url": "https://www.dadoune.com" | ||
}, | ||
"scripts": { | ||
"precommit": "lint-staged" | ||
}, | ||
"lint-staged": { | ||
"*.{js,jsx}": ["prettier --single-quote --trailing-comma es5 --write", "git add"] | ||
}, | ||
"dependencies": { | ||
"bluebird": "^3.5.0", | ||
"debug": "^3.0.0", | ||
"dotenv": "^4.0.0", | ||
"lodash": "^4.17.4", | ||
"puppeteer": "^0.9.0", | ||
"redis": "^2.8.0", | ||
"yargs": "^8.0.2" | ||
}, | ||
"devDependencies": { | ||
"husky": "^0.14.3", | ||
"lint-staged": "^4.0.3", | ||
"prettier": "^1.5.3" | ||
} | ||
} |
Oops, something went wrong.