Skip to content

Commit

Permalink
Merge pull request #234 from robcowie/google-web-preview-bot
Browse files Browse the repository at this point in the history
Classify Google Web Preview as a Spider
  • Loading branch information
commenthol authored May 22, 2017
2 parents 88f9f27 + 075d55f commit cbd8c65
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 1 deletion.
2 changes: 1 addition & 1 deletion regexes.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4883,7 +4883,7 @@ device_parsers:
##########
# Spiders (this is a hack...)
##########
- regex: '(bot|zao|borg|DBot|oegp|silk|Xenu|zeal|^NING|CCBot|crawl|htdig|lycos|slurp|teoma|voila|yahoo|Sogou|CiBra|Nutch|^Java/|^JNLP/|Daumoa|Genieo|ichiro|larbin|pompos|Scrapy|snappy|speedy|spider|msnbot|msrbot|vortex|^vortex|crawler|favicon|indexer|Riddler|scooter|scraper|scrubby|WhatWeb|WinHTTP|bingbot|BingPreview|openbot|gigabot|furlbot|polybot|seekbot|^voyager|archiver|Icarus6j|mogimogi|Netvibes|blitzbot|altavista|charlotte|findlinks|Retreiver|TLSProber|WordPress|SeznamBot|ProoXiBot|wsr\-agent|Squrl Java|EtaoSpider|PaperLiBot|SputnikBot|A6\-Indexer|netresearch|searchsight|baiduspider|YisouSpider|ICC\-Crawler|http%20client|Python-urllib|dataparksearch|converacrawler|Screaming Frog|AppEngine-Google|YahooCacheSystem|fast\-webcrawler|Sogou Pic Spider|semanticdiscovery|Innovazion Crawler|facebookexternalhit|Google.*/\+/web/snippet|Google-HTTP-Java-Client|BlogBridge|IlTrovatore-Setaccio|InternetArchive|GomezAgent|WebThumbnail|heritrix|NewsGator|PagePeeker|Reaper|ZooShot|holmes|NL-Crawler|Pingdom|StatusCake|WhatsApp|masscan)'
- regex: '(bot|zao|borg|DBot|oegp|silk|Xenu|zeal|^NING|CCBot|crawl|htdig|lycos|slurp|teoma|voila|yahoo|Sogou|CiBra|Nutch|^Java/|^JNLP/|Daumoa|Genieo|ichiro|larbin|pompos|Scrapy|snappy|speedy|spider|msnbot|msrbot|vortex|^vortex|crawler|favicon|indexer|Riddler|scooter|scraper|scrubby|WhatWeb|WinHTTP|bingbot|BingPreview|openbot|gigabot|furlbot|polybot|seekbot|^voyager|archiver|Icarus6j|mogimogi|Netvibes|blitzbot|altavista|charlotte|findlinks|Retreiver|TLSProber|WordPress|SeznamBot|ProoXiBot|wsr\-agent|Squrl Java|EtaoSpider|PaperLiBot|SputnikBot|A6\-Indexer|netresearch|searchsight|baiduspider|YisouSpider|ICC\-Crawler|http%20client|Python-urllib|dataparksearch|converacrawler|Screaming Frog|AppEngine-Google|YahooCacheSystem|fast\-webcrawler|Sogou Pic Spider|semanticdiscovery|Innovazion Crawler|facebookexternalhit|Google.*/\+/web/snippet|Google-HTTP-Java-Client|BlogBridge|IlTrovatore-Setaccio|InternetArchive|GomezAgent|WebThumbnail|heritrix|NewsGator|PagePeeker|Reaper|ZooShot|holmes|NL-Crawler|Pingdom|StatusCake|WhatsApp|masscan|Google Web Preview)'
regex_flag: 'i'
device_replacement: 'Spider'
brand_replacement: 'Spider'
Expand Down
9 changes: 9 additions & 0 deletions tests/test_device.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -79975,3 +79975,12 @@ test_cases:
brand: 'Huawei'
model: 'EVA-AL10'

# Device test: the bare 'Google Web Preview' user agent is expected to yield
# family 'Spider', brand 'Spider' and model 'Desktop'.
- user_agent_string: 'Google Web Preview'
family: 'Spider'
brand: 'Spider'
model: 'Desktop'

# Device test: a browser-style user agent that carries the 'Google Web Preview'
# token inside its parenthesised section is expected to yield family 'Spider',
# brand 'Spider' and model 'Desktop'.
# NOTE(review): the string contains 'Chrome/27.0 .1453' (with a space) and a
# trailing '.'; presumably copied verbatim from a real request - confirm.
- user_agent_string: 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko; Google Web Preview) Chrome/27.0 .1453 Safari/537.36.'
family: 'Spider'
brand: 'Spider'
model: 'Desktop'
7 changes: 7 additions & 0 deletions tests/test_os.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -420,6 +420,13 @@ test_cases:
patch:
patch_minor:

# OS test: for the browser-style user agent containing 'Google Web Preview',
# the expected OS family is 'Linux'; all version fields are left empty.
- user_agent_string: 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko; Google Web Preview) Chrome/27.0 .1453 Safari/537.36.'
family: 'Linux'
major:
minor:
patch:
patch_minor:

- user_agent_string: 'Bunjalloo/0.7.6(Nintendo DS;U;en)'
family: 'Other'
major:
Expand Down
12 changes: 12 additions & 0 deletions tests/test_ua.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2343,6 +2343,18 @@ test_cases:
minor:
patch:

# User-agent test: the bare 'Google Web Preview' string is expected to yield
# family 'Other'; all version fields are left empty.
- user_agent_string: 'Google Web Preview'
family: 'Other'
major:
minor:
patch:

# User-agent test: the browser-style user agent containing 'Google Web Preview'
# is expected to yield family 'Chrome' with major '27' and minor '0'.
# NOTE(review): the string reads 'Chrome/27.0 .1453' with a space before
# '.1453'; presumably that is why patch is expected to be empty - confirm
# against the user-agent regex.
- user_agent_string: 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko; Google Web Preview) Chrome/27.0 .1453 Safari/537.36.'
family: 'Chrome'
major: '27'
minor: '0'
patch:

- user_agent_string: 'HiddenMarket-1.0-beta (www.hiddenmarket.net/crawler.php)'
family: 'HiddenMarket'
major:
Expand Down

0 comments on commit cbd8c65

Please sign in to comment.