-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathapp.js
146 lines (118 loc) · 6.11 KB
/
app.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
const puppeteer = require('puppeteer-extra');
const cheerio = require('cheerio');
const converter = require('json-2-csv');
const fs = require("node:fs");
const readline = require('readline');
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
// Register the stealth plugin on the puppeteer-extra instance before any browser is launched.
puppeteer.use(StealthPlugin());
// Console prompt bound to this process's stdin/stdout; used below to ask for the search string.
const rl = readline.createInterface({
input: process.stdin,
output: process.stdout
});
/**
 * Entry point. Prompts for a search phrase, opens the Google Local Services
 * listing for it in a visible browser window, walks every result page, and
 * writes the collected records to a randomly named `output-*.csv` file.
 *
 * Each record holds `name`, `address`, `phone` and `website` ("NONE" when the
 * detail panel has no matching element). Records that have a website also get
 * `copyright_year` (first year returned by extractCopyrightYear, or null) and
 * `response_time` (seconds until `fetch` resolved, or "failed").
 *
 * If scraping fails part-way, the error is logged and whatever was collected
 * so far is still written out. The browser is always closed at the end.
 */
(async () => {
  // Upper bound for a single business-website request, so one unresponsive
  // site cannot stall the whole page's Promise.all forever.
  const WEBSITE_TIMEOUT_MS = 15000;

  // Plain timer-based pause; page.waitForTimeout is no longer available in
  // current Puppeteer releases.
  const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  // Runs inside the page: opens each organic card's detail panel in turn and
  // reads the business fields from it. Must stay self-contained because the
  // function is serialised into the browser context.
  const scrapeVisibleCards = (page) => page.evaluate(async () => {
    const read = (selector, prop) => {
      const el = document.querySelector(selector);
      return el ? el[prop] : "NONE";
    };
    const organicCards = Array.from(document.querySelectorAll('div[data-test-id="organic-list-card"]'));
    const cardData = [];
    for (const card of organicCards) {
      try {
        card.querySelector('div[role="button"] > div:first-of-type').click();
        // Give the detail panel time to render the clicked card.
        await new Promise((resolve) => setTimeout(resolve, 1000));
        const phoneButton = document.querySelector('[data-phone-number][role="button"][class*=" "]');
        cardData.push({
          name: read(".tZPcob", "innerText"),
          address: read(".fccl3c", "innerText"),
          // A phone button without the inner div yields "NONE" instead of
          // throwing and dropping the whole card.
          phone: phoneButton?.querySelector("div:last-of-type")?.innerHTML ?? "NONE",
          website: read(".iPF7ob > div:last-of-type", "innerHTML"),
        });
      } catch (e) {
        // Logged to the browser console; the card is skipped.
        console.log(e);
      }
    }
    return cardData;
  });

  // Fetches the card's website (when it has one) and returns a copy of the
  // card extended with `copyright_year` and `response_time`.
  const addWebsiteInfo = async (card) => {
    if (!card.website || card.website === "NONE") return card;
    const site = card.website.trim();
    // Only treat the value as a full URL when it actually starts with a
    // scheme; a host that merely contains "http" still needs the prefix.
    const websiteURL = /^https?:\/\//i.test(site) ? site : `https://${site}`;
    const requestedAt = Date.now();
    try {
      const response = await fetch(websiteURL, { signal: AbortSignal.timeout(WEBSITE_TIMEOUT_MS) });
      const responseTime = (Date.now() - requestedAt) / 1000;
      const [firstYear = null] = extractCopyrightYear(await response.text());
      return { ...card, copyright_year: firstYear, response_time: `${responseTime}s` };
    } catch {
      // Best effort: unreachable, slow or unparsable sites are recorded as
      // failed rather than aborting the scrape.
      return { ...card, copyright_year: null, response_time: "failed" };
    }
  };

  // Serialises the records to CSV and writes them to a randomly named file.
  const saveResults = async (records, startedAt) => {
    const csv = await converter.json2csv(records);
    fs.writeFileSync(`output-${(Math.random() + 1).toString(36).substring(7)}.csv`, csv, "utf-8");
    console.log(`[+] Records saved to CSV file`);
    console.log(`[success] Scraped ${records.length} records in ${(Date.now() - startedAt) / 1000}s`);
  };

  console.clear();
  console.log(`Google Maps Scraper\nCreated by @zxkjwa (discord)\n------------------------`);

  // Awaiting the prompt keeps everything that follows inside this async
  // function, so failures reach the handlers below instead of becoming
  // unhandled rejections in a callback.
  const searchString = await new Promise((resolve) => {
    rl.question('Enter a string to search for: ', resolve);
  });
  rl.close();

  const browser = await puppeteer.launch({ headless: false });
  try {
    const page = await browser.newPage();
    const startedAt = Date.now();
    const scrapeData = [];
    try {
      const query = encodeURIComponent(searchString);
      await page.goto(`https://www.google.com/localservices/prolist?hl=en-GB&gl=uk&ssta=1&q=${query}&oq=${query}&src=2`);
      const acceptAllButton = await page.$('button[aria-label="Accept all"]');
      if (acceptAllButton) {
        await acceptAllButton.click();
      }
      await sleep(3000);

      // One iteration per result page; stops when there is no "Next" button.
      for (;;) {
        const rawCards = await scrapeVisibleCards(page);
        const cards = await Promise.all(rawCards.map(addWebsiteInfo));
        console.log(`[data] Succesfully scraped ${cards.length} records, continuing to the next page if it's available`);
        scrapeData.push(...cards);

        const nextButton = await page.$('button[aria-label="Next"]');
        if (!nextButton) break;
        await nextButton.click();
        await sleep(5000);
      }
    } catch (e) {
      // Keep the records gathered so far; they are saved below.
      console.error(`[error] Scraping stopped early:`, e);
    }
    await saveResults(scrapeData, startedAt);
  } finally {
    // Always release the browser, otherwise the Chromium child process keeps
    // this Node process alive.
    await browser.close();
  }
})().catch((e) => {
  console.error(`[fatal]`, e);
  process.exitCode = 1;
});
/**
 * Collects the years mentioned in short copyright notices of an HTML document.
 *
 * Every `<div>` whose text is at most 400 characters long and mentions
 * "copyright" (in any letter case) or the © sign is scanned for standalone
 * four-digit years in the range 1900-2099. Other four-digit runs such as
 * phone-number fragments or street numbers are ignored.
 *
 * @param {string} html - Raw HTML of the page.
 * @returns {number[]} Matching years in document order; empty when none found.
 */
function extractCopyrightYear(html) {
  // Longer blocks are treated as page content that merely contains a notice
  // somewhere inside, not as the notice itself.
  const MAX_NOTICE_LENGTH = 400;
  const $ = cheerio.load(html);
  const copyrightYears = [];

  $('div').each((index, element) => {
    const divText = $(element).text();
    if (divText.length > MAX_NOTICE_LENGTH) return;
    // Case-insensitive so "COPYRIGHT" and "copyright" are recognised too.
    if (!/copyright|©/i.test(divText)) return;

    const years = divText.match(/\b(?:19|20)\d{2}\b/g) ?? [];
    for (const year of years) {
      copyrightYears.push(Number.parseInt(year, 10));
    }
  });

  return copyrightYears;
}