-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdomain_checker_experiments.py
106 lines (93 loc) · 3.82 KB
/
domain_checker_experiments.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import urllib.request
import urllib.robotparser
import socket
import ssl
# Per-request socket timeout in seconds; installed process-wide by main().
timeout_seconds = 1

# SSL context that doesn't verify the certificate.
# NOTE(review): RobotFileParser.read() is called without this context, so it has
# no effect on the live scan below; it is only referenced by the commented-out
# urlopen experiment. ssl._create_unverified_context is a private helper and
# disables certificate checks - confirm it is still wanted before reusing it.
ssl_context = ssl._create_unverified_context()

# from reppy.robots import Robots


def load_domain_list(path):
    """Return the non-blank lines of *path*, stripped, in file order.

    Args:
        path: Text file holding one domain per line.

    Returns:
        A list of stripped lines. Blank lines are dropped so they never turn
        into an empty host name such as ``https:///robots.txt``.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(path, 'r', encoding='utf-8') as file:
        stripped = (line.strip() for line in file)
        return [name for name in stripped if name]


def select_domains(domains, excluded):
    """Return *domains* minus *excluded*, de-duplicated, order preserved.

    Args:
        domains: Iterable of candidate domain names.
        excluded: Iterable of domain names to leave out.

    Returns:
        A list with each remaining domain once, in first-seen order, so the
        index printed during the scan is the same on every run.
    """
    blocked = set(excluded)  # O(1) membership instead of scanning a list
    return list(dict.fromkeys(d for d in domains if d not in blocked))


def gptbot_entries(parser, agent="GPTBot"):
    """Return the parsed robots.txt entries that name *agent*.

    The comparison ignores case, so ``User-agent: gptbot`` and
    ``User-agent: GPTBot`` are both found.

    Args:
        parser: A ``urllib.robotparser.RobotFileParser`` that has already
            parsed a robots.txt file.
        agent: User-agent token to look for.

    Returns:
        A list of the parser's entries whose user-agent list contains *agent*.
    """
    wanted = agent.casefold()
    # NOTE: `entries` / `useragents` are attributes of the stdlib parser that
    # are not covered by its documented API.
    return [
        entry
        for entry in parser.entries
        if any(name.casefold() == wanted for name in entry.useragents)
    ]


def check_domain(domain, index):
    """Fetch one domain's robots.txt and print what it says about GPTBot.

    Prints the URL being checked, whether the "gptbot" agent may fetch "/",
    and every robots.txt entry that names GPTBot.

    Args:
        domain: Host name to check (no scheme).
        index: Position of the domain in the scan, shown in the output.

    Raises:
        Exception: Whatever ``RobotFileParser.read()`` raises when the file
            cannot be fetched or decoded; the caller reports it.
    """
    robots_url = f'https://{domain}/robots.txt'  # Use HTTPS
    print("\n\n> Checking Domain:", domain, robots_url, f"({index})")
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(robots_url)
    rp.read()
    rp_response = rp.can_fetch("gptbot", "/")
    print(">> RP_RESPONSE:", rp_response)
    for entry in gptbot_entries(rp):
        print(">>>", entry)
        # for rule in entry.rulelines:
        #     print(">>> RULE:", rule)


def main():
    """Check every non-excluded domain from domains.txt for GPTBot rules."""
    socket.setdefaulttimeout(timeout_seconds)

    # Read domains from domains.txt and excluded domains from excluded.txt
    domains = load_domain_list('domains.txt')
    excluded_domains = load_domain_list('excluded.txt')

    # Exclude the domains listed in excluded.txt
    domains_to_check = select_domains(domains, excluded_domains)

    # Iterate through each domain
    for i, domain in enumerate(domains_to_check):
        try:
            check_domain(domain, i)
        except Exception as exc:
            # Best-effort scan: one unreachable or malformed site must not
            # stop the run, but the failure is reported instead of hidden.
            # KeyboardInterrupt is not an Exception, so Ctrl-C still works.
            print(">> FAILED:", domain, repr(exc))


if __name__ == "__main__":
    main()
# import robots
# parser = robots.RobotsParser.from_uri(robots_url)
#
# parser = robots.RobotsParser.from_uri(robots_url)
# useragent = 'GPTBot'
# path = '/'
# rp_response = parser.can_fetch(useragent, path)
# print(">> rp_response:", rp_response)
# agent_result = parser.find_rules('gptbot')
# print(">>> agent_result:", agent_result)
# robots = Robots.fetch(robots_url)
# # robots.allowed(domain, 'GPTBot')
#
# # Get the rules for a specific agent
# agent = robots.agent('GPTBot')
# agent.allowed('http://example.com/some/path/')
# # Fetch the content of robots.txt
# try:
# with urllib.request.urlopen(robots_url, timeout=timeout_seconds, context=ssl_context) as response:
# lines = response.read().decode().splitlines()
#
# # Flags to indicate if we are in the GPTBot section or the wildcard section
# in_gptbot_section = False
# in_wildcard_section = False
# wildcard_rules = []
#
# # Iterate through the lines of robots.txt
# for line in lines:
# line = line.strip()
# if line == "User-agent: GPTBot":
# # print(f"Checking rules for domain: {robots_url}")
# in_gptbot_section = True
# in_wildcard_section = False
# elif line == "User-Agent: *":
# in_wildcard_section = True
# in_gptbot_section = False
# elif line.startswith("User-agent:") and (in_gptbot_section or in_wildcard_section):
# break
# elif in_gptbot_section:
# print(">", line) # Print the Allow/Disallow lines for GPTBot
# elif in_wildcard_section:
# # print(">>", line) # Print the Allow/Disallow lines for GPTBot
# wildcard_rules.append(line)
#
# # If no specific rules for GPTBot were found, print the wildcard rules
# # if not in_gptbot_section and wildcard_rules:
# # print("No specific rules for GPTBot. Applying wildcard rules:")
# # for rule in wildcard_rules:
# # print(rule)
#
# except urllib.error.HTTPError:
# print(f"Failed to fetch robots.txt for {domain}")
# except socket.timeout:
# print("Timeout occurred")
# except Exception as e:
# print("An unexpected error occurred:", e)
#
# # print() # Print a newline for separation between domains