import io
import os
import threading
import unittest
import urllib.robotparser
import urllib.request
from test import support
from test.support import socket_helper
from test.support import threading_helper
from http.server import BaseHTTPRequestHandler, HTTPServer


class BaseRobotTest:
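    """Mixin shared by the robots.txt test cases below.

    Subclasses set ``robots_txt``, ``agent``, ``good`` and ``bad``; setUp()
    parses the text, and the tests assert that can_fetch() allows every URL
    in ``good`` and rejects every URL in ``bad``.
    """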
    robots_txt = ''
    agent = 'test_robotparser'
    good = []
    bad = []
    site_maps = None

    def setUp(self):
        lines = io.StringIO(self.robots_txt).readlines()
        self.parser = urllib.robotparser.RobotFileParser()
        self.parser.parse(lines)

    def get_agent_and_url(self, url):
        if isinstance(url, tuple):
            agent, url = url
            return agent, url
        return self.agent, url

    def test_good_urls(self):
        for url in self.good:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertTrue(self.parser.can_fetch(agent, url))

    def test_bad_urls(self):
        for url in self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertFalse(self.parser.can_fetch(agent, url))

    def test_site_maps(self):
        self.assertEqual(self.parser.site_maps(), self.site_maps)


class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
"""
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html']


class CrawlDelayAndCustomAgentTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# robots.txt for http://www.example.com/

User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:
"""
    good = ['/', '/test.html', ('cybermapper', '/cyberworld/map/index.html')]
    bad = ['/cyberworld/map/index.html']


class SitemapTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# robots.txt for http://www.example.com/

User-agent: *
Sitemap: http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml
Sitemap: http://www.google.com/hostednews/sitemap_index.xml
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

"""
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html']
    site_maps = ['http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml',
                 'http://www.google.com/hostednews/sitemap_index.xml']


class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# go away
User-agent: *
Disallow: /
"""
    good = []
    bad = ['/cyberworld/map/index.html', '/', '/tmp/']


class BaseRequestRateTest(BaseRobotTest):
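    """Mixin that additionally checks crawl_delay() and request_rate()."""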
    request_rate = None
    crawl_delay = None

    def test_request_rate(self):
        parser = self.parser
        for url in self.good + self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertEqual(parser.crawl_delay(agent), self.crawl_delay)

                parsed_request_rate = parser.request_rate(agent)
                self.assertEqual(parsed_request_rate, self.request_rate)
                if self.request_rate is not None:
                    self.assertIsInstance(
                        parsed_request_rate,
                        urllib.robotparser.RequestRate
                    )
                    self.assertEqual(
                        parsed_request_rate.requests,
                        self.request_rate.requests
                    )
                    self.assertEqual(
                        parsed_request_rate.seconds,
                        self.request_rate.seconds
                    )


class EmptyFileTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = ''
    good = ['/foo']


class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = """\
User-agent: figtree
Crawl-delay: 3
Request-rate: 9/30
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
"""
    agent = 'figtree'
    request_rate = urllib.robotparser.RequestRate(9, 30)
    crawl_delay = 3
    good = [('figtree', '/foo.html')]
    bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',
           '/a%2fb.html', '/~joe/index.html']


class DifferentAgentTest(CrawlDelayAndRequestRateTest):
    agent = 'FigTree Robot libwww-perl/5.04'


class InvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
Crawl-delay: 3
Request-rate: 9/banana
"""
    good = ['/tmp']
    bad = ['/tmp/', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html', '/a/b.html',
           '/%7Ejoe/index.html']
    crawl_delay = 3


class InvalidCrawlDelayTest(BaseRobotTest, unittest.TestCase):
    # From bug report #523041
    robots_txt = """\
User-Agent: *
Disallow: /.
Crawl-delay: pears
"""
    good = ['/foo.html']
    # bug report says "/" should be denied, but that is not in the RFC
    bad = []


class AnotherInvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    # also test that Allow and Disallow work well with each other
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
Request-rate: whale/banana
"""
    agent = 'Googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class UserAgentOrderingTest(BaseRobotTest, unittest.TestCase):
    # the order of User-agent entries should be respected. Note that this
    # robots.txt is incorrect because "Googlebot" is a substring of
    # "Googlebot-Mobile"
    robots_txt = """\
User-agent: Googlebot
Disallow: /

User-agent: Googlebot-Mobile
Allow: /
"""
    agent = 'Googlebot'
    bad = ['/something.jpg']


class UserAgentGoogleMobileTest(UserAgentOrderingTest):
    agent = 'Googlebot-Mobile'


class GoogleURLOrderingTest(BaseRobotTest, unittest.TestCase):
    # Google also got the order wrong. You need
    # to specify the URLs from more specific to more general
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
"""
    agent = 'googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
    # see issue #6325 for details
    robots_txt = """\
User-agent: *
Disallow: /some/path?name=value
"""
    good = ['/some/path']
    bad = ['/some/path?name=value']


class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    # obey first * entry (#4108)
    robots_txt = """\
User-agent: *
Disallow: /some/path

User-agent: *
Disallow: /another/path
"""
    good = ['/another/path']
    bad = ['/some/path']


class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
    # normalize the URL first (#17403)
    robots_txt = """\
User-agent: *
Allow: /some/path?
Disallow: /another/path?
"""
    good = ['/some/path?']
    bad = ['/another/path?']


class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/
"""
    request_rate = urllib.robotparser.RequestRate(3, 15)
    crawl_delay = 1
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html']


class StringFormattingTest(BaseRobotTest, unittest.TestCase):
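    """str(parser) lists named entries before the default (*) entry."""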
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow: /some/path
"""

    expected_output = """\
User-agent: cybermapper
Disallow: /some/path

User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/\
"""

    def test_string_formatting(self):
        self.assertEqual(str(self.parser), self.expected_output)


class RobotHandler(BaseHTTPRequestHandler):
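    """Answer every GET request with 403 Forbidden."""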

    def do_GET(self):
        self.send_error(403, "Forbidden access")

    def log_message(self, format, *args):
        pass


@unittest.skipUnless(
    support.has_socket_support,
    "Socket server requires working socket."
)
class PasswordProtectedSiteTestCase(unittest.TestCase):
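    """A 403 response for robots.txt should disallow all URLs."""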

    def setUp(self):
        # clear _opener global variable
        self.addCleanup(urllib.request.urlcleanup)

        self.server = HTTPServer((socket_helper.HOST, 0), RobotHandler)

        self.t = threading.Thread(
            name='HTTPServer serving',
            target=self.server.serve_forever,
            # Short poll interval to make the test finish quickly.
            # Time between requests is short enough that we won't wake
            # up spuriously too many times.
            kwargs={'poll_interval': 0.01})
        self.t.daemon = True  # In case this function raises.
        self.t.start()

    def tearDown(self):
        self.server.shutdown()
        self.t.join()
        self.server.server_close()

    @threading_helper.reap_threads
    def testPasswordProtectedSite(self):
        addr = self.server.server_address
        url = 'http://' + socket_helper.HOST + ':' + str(addr[1])
        robots_url = url + "/robots.txt"
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(url)
        parser.read()
        self.assertFalse(parser.can_fetch("*", robots_url))


@support.requires_working_socket()
class NetworkTestCase(unittest.TestCase):
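    """Exercise the parser against the live robots.txt on pythontest.net.

    Requires the 'network' test resource and a working internet connection.
    """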

    base_url = 'http://www.pythontest.net/'
    robots_txt = '{}elsewhere/robots.txt'.format(base_url)

    @classmethod
    def setUpClass(cls):
        support.requires('network')
        with socket_helper.transient_internet(cls.base_url):
            cls.parser = urllib.robotparser.RobotFileParser(cls.robots_txt)
            cls.parser.read()

    def url(self, path):
        return '{}{}{}'.format(
            self.base_url, path, '/' if not os.path.splitext(path)[1] else ''
        )

    def test_basic(self):
        self.assertFalse(self.parser.disallow_all)
        self.assertFalse(self.parser.allow_all)
        self.assertGreater(self.parser.mtime(), 0)
        self.assertFalse(self.parser.crawl_delay('*'))
        self.assertFalse(self.parser.request_rate('*'))

    def test_can_fetch(self):
        self.assertTrue(self.parser.can_fetch('*', self.url('elsewhere')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.base_url))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('brian')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('webstats')))
        self.assertFalse(self.parser.can_fetch('*', self.url('webstats')))
        self.assertTrue(self.parser.can_fetch('*', self.base_url))

    def test_read_404(self):
        parser = urllib.robotparser.RobotFileParser(self.url('i-robot.txt'))
        parser.read()
        self.assertTrue(parser.allow_all)
        self.assertFalse(parser.disallow_all)
        self.assertEqual(parser.mtime(), 0)
        self.assertIsNone(parser.crawl_delay('*'))
        self.assertIsNone(parser.request_rate('*'))


if __name__ == '__main__':
    unittest.main()