-
Notifications
You must be signed in to change notification settings - Fork 21
/
Copy pathscrape.js
84 lines (73 loc) · 2.35 KB
/
scrape.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
// based on http://anismiles.wordpress.com/2010/11/29/node-js-and-jquery-to-scrape-websites/
// External Modules
var request = require('ahr2'), // Abstract-HTTP-request https://github.com/coolaj86/abstract-http-request
sys = require('sys'), // System
events = require('events'), // EventEmitter
jsdom = require('jsdom'); // JsDom https://github.com/tmpvar/jsdom
// URL of the jQuery build that search() asks jsdom to load into each window
// before parseHTML is called. Uses HTTPS because the downloaded script is
// evaluated by jsdom; over plain HTTP it could be tampered with in transit.
var jQueryPath = 'https://code.jquery.com/jquery-2.0.3.min.js';
// Default request headers, used when the Searcher is constructed without
// `param.headers`. Example of a JSON-oriented override:
//   {'content-type':'application/json', 'accept': 'application/json'}
var headers = {};
// Export searcher
module.exports = Searcher;
/**
 * Base searcher: fetches a results page and emits events while parsing it.
 *
 * @constructor
 * @param {Object} [param] - Options; omitted or falsy is treated as `{}`.
 * @param {Object} [param.headers] - Request headers; falls back to the
 *     module-level default `headers` object when not supplied.
 * @param {Function} [param.getSearchUrl] - Replaces the default (throwing)
 *     `getSearchUrl` on this instance.
 * @param {Function} [param.parseHTML] - Replaces the default (throwing)
 *     `parseHTML` on this instance.
 * @param {*} [param.id] - Identifier stored on the instance as `id`.
 */
function Searcher(param) {
  // Tolerate `new Searcher()`; previously this crashed on `param.headers`.
  var options = param || {};

  // Run the EventEmitter initialiser so every instance owns its listener
  // table instead of relying on state reachable through the prototype.
  events.EventEmitter.call(this);

  this.headers = options.headers ? options.headers : headers;
  if (options.getSearchUrl) {
    this.getSearchUrl = options.getSearchUrl;
  }
  if (options.parseHTML) {
    this.parseHTML = options.parseHTML;
  }
  this.id = options.id;
}
// Inherit from EventEmitter. `process.EventEmitter` was only a legacy alias
// and is absent from modern Node.js, so use the `events` module required at
// the top of the file. Object.create avoids using a live emitter instance as
// the prototype; `constructor` is restored so it points back at Searcher.
Searcher.prototype = Object.create(events.EventEmitter.prototype, {
  constructor: {
    value: Searcher,
    enumerable: false,
    writable: true,
    configurable: true
  }
});
/**
 * Fetches the page returned by `getSearchUrl(query)`, builds a jsdom window
 * from the response body, loads jQuery into it and passes the window to
 * `parseHTML`.
 *
 * Outcomes are reported through the instance's own hooks:
 *  - fetch failure: `onError({error, searcher})`, then `onComplete({searcher})`
 *  - `parseHTML` throws: `onError({error, searcher})`, then
 *    `onComplete({searcher})`, so 'complete' is signalled on this path too
 *  - success: `onComplete({searcher})`
 *
 * After a page has been fetched and handled, the process is terminated with
 * `process.exit()` unless the instance has `exitOnComplete === false`. Exit
 * stays the default for backward compatibility; set
 * `searcher.exitOnComplete = false` when embedding the searcher in a
 * longer-lived program or running several searches.
 *
 * @param {*} query - Passed unchanged to `getSearchUrl`.
 * @param {*} [collector] - Unused; kept so existing call sites keep working.
 */
Searcher.prototype.search = function(query, collector) {
  var self = this;
  var url = self.getSearchUrl(query);
  console.log('Connecting to... ' + url);

  request({
    href: url,
    method: 'GET',
    headers: self.headers,
    timeout: 10000
  }).when(function(err, response, html) {
    if (err) {
      console.log('Failed to fetch content with error: ' + err);
      self.onError({error: err, searcher: self});
      self.onComplete({searcher: self});
      return;
    }

    console.log('Fetched content from... ' + url);
    // create DOM window from HTML data
    var window = jsdom.jsdom(html).createWindow();
    // load jquery with DOM window and call the parser!
    jsdom.jQueryify(window, jQueryPath, function() {
      try {
        self.parseHTML(window);
      } catch (parseErr) {
        // Report parser failures the same way as fetch failures so that
        // 'complete' is still signalled below.
        self.onError({error: parseErr, searcher: self});
      }
      self.onComplete({searcher: self});
      // Only an explicit `false` disables the exit, so existing users that
      // rely on the script terminating here see no change.
      if (self.exitOnComplete !== false) {
        process.exit();
      }
    });
  });
};
/**
 * Builds the URL that `search` fetches for a query. This default only signals
 * that no implementation was supplied; provide one through
 * `param.getSearchUrl` in the constructor or in an inherited class.
 *
 * @param {*} query - The search query.
 * @throws {Error} Always, with the message "getSearchUrl() is unimplemented!".
 */
Searcher.prototype.getSearchUrl = function(query) {
  // Throw a real Error (not a bare string) so callers get a stack trace and
  // `instanceof Error` checks work.
  throw new Error("getSearchUrl() is unimplemented!");
};
/**
 * Extracts results from a fetched page. This default only signals that no
 * implementation was supplied; provide one through `param.parseHTML` in the
 * constructor or in an inherited class.
 *
 * @param {Object} window - The jsdom window that `search` created from the
 *     response body and loaded jQuery into.
 * @throws {Error} Always, with the message "parseHTML() is unimplemented!".
 */
Searcher.prototype.parseHTML = function(window) {
  // Throw a real Error (not a bare string) so callers get a stack trace and
  // `instanceof Error` checks work.
  throw new Error("parseHTML() is unimplemented!");
};
/**
 * Announces one found item to listeners by emitting an 'item' event.
 *
 * @param {*} item - The payload forwarded to 'item' listeners.
 */
Searcher.prototype.onItem = function(item) {
  var eventName = 'item';
  this.emit(eventName, item);
};
/**
 * Signals that this searcher has finished by emitting a 'complete' event.
 *
 * @param {*} searcher - Payload forwarded to 'complete' listeners; `search`
 *     passes an object of the form `{searcher: <this instance>}`.
 */
Searcher.prototype.onComplete = function(searcher) {
  var eventName = 'complete';
  this.emit(eventName, searcher);
};
/**
 * Reports a failure to listeners by emitting an 'error' event.
 *
 * @param {*} error - Payload forwarded to 'error' listeners; `search` passes
 *     an object of the form `{error: <cause>, searcher: <this instance>}`.
 */
Searcher.prototype.onError = function(error) {
  var eventName = 'error';
  this.emit(eventName, error);
};
/**
 * Returns the searcher's `id` as a string.
 *
 * The id is coerced with `String()` so the method always returns a string,
 * even when `id` was given as a number or an object.
 *
 * @returns {string} String form of `this.id`.
 */
Searcher.prototype.toString = function() {
  return String(this.id);
};