(root)/
Python-3.11.7/
Lib/
test/
test_robotparser.py
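"""Tests for urllib.robotparser.

A minimal sketch of the API under test (the agent name 'somebot' is just
an illustrative placeholder):

    >>> import urllib.robotparser
    >>> rp = urllib.robotparser.RobotFileParser()
    >>> rp.parse(['User-agent: *', 'Disallow: /private/'])
    >>> rp.can_fetch('somebot', '/private/page.html')
    False
"""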
import io
import os
import threading
import unittest
import urllib.request  # make the urlcleanup() dependency explicit
import urllib.robotparser
from test import support
from test.support import socket_helper
from test.support import threading_helper
from http.server import BaseHTTPRequestHandler, HTTPServer


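# Mixin shared by the concrete test cases below: each subclass supplies a
# robots.txt body plus 'good' (fetchable) and 'bad' (blocked) URL lists,
# and mixes in unittest.TestCase.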
class BaseRobotTest:
    robots_txt = ''
    agent = 'test_robotparser'
    good = []
    bad = []
    site_maps = None

    def setUp(self):
        lines = io.StringIO(self.robots_txt).readlines()
        self.parser = urllib.robotparser.RobotFileParser()
        self.parser.parse(lines)

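    # Entries in 'good'/'bad' are either plain URL strings or (agent, url)
    # tuples that override the class-level agent.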
    def get_agent_and_url(self, url):
        if isinstance(url, tuple):
            agent, url = url
            return agent, url
        return self.agent, url

    def test_good_urls(self):
        for url in self.good:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertTrue(self.parser.can_fetch(agent, url))

    def test_bad_urls(self):
        for url in self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertFalse(self.parser.can_fetch(agent, url))

    def test_site_maps(self):
        self.assertEqual(self.parser.site_maps(), self.site_maps)


class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
    """
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html']


class CrawlDelayAndCustomAgentTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# robots.txt for http://www.example.com/

User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:
    """
    good = ['/', '/test.html', ('cybermapper', '/cyberworld/map/index.html')]
    bad = ['/cyberworld/map/index.html']


class SitemapTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# robots.txt for http://www.example.com/

User-agent: *
Sitemap: http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml
Sitemap: http://www.google.com/hostednews/sitemap_index.xml
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

    """
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html']
    site_maps = ['http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml',
                 'http://www.google.com/hostednews/sitemap_index.xml']


class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# go away
User-agent: *
Disallow: /
    """
    good = []
    bad = ['/cyberworld/map/index.html', '/', '/tmp/']


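# Mixin that additionally checks crawl_delay() and request_rate() against
# expected values supplied by the subclass.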
class BaseRequestRateTest(BaseRobotTest):
    request_rate = None
    crawl_delay = None

    def test_request_rate(self):
        parser = self.parser
        for url in self.good + self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertEqual(parser.crawl_delay(agent), self.crawl_delay)

                parsed_request_rate = parser.request_rate(agent)
                self.assertEqual(parsed_request_rate, self.request_rate)
                if self.request_rate is not None:
                    self.assertIsInstance(
                        parsed_request_rate,
                        urllib.robotparser.RequestRate
                    )
                    self.assertEqual(
                        parsed_request_rate.requests,
                        self.request_rate.requests
                    )
                    self.assertEqual(
                        parsed_request_rate.seconds,
                        self.request_rate.seconds
                    )


class EmptyFileTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = ''
    good = ['/foo']


class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = """\
User-agent: figtree
Crawl-delay: 3
Request-rate: 9/30
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
    """
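    # Percent-escapes should match case-insensitively ('%3c' vs. '%3C'),
    # and '%7e' should be treated the same as '~'.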
    agent = 'figtree'
    request_rate = urllib.robotparser.RequestRate(9, 30)
    crawl_delay = 3
    good = [('figtree', '/foo.html')]
    bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',
           '/a%2fb.html', '/~joe/index.html']


class DifferentAgentTest(CrawlDelayAndRequestRateTest):
    agent = 'FigTree Robot libwww-perl/5.04'


class InvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
Crawl-delay: 3
Request-rate: 9/banana
    """
    good = ['/tmp']
    bad = ['/tmp/', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html', '/a/b.html',
           '/%7Ejoe/index.html']
    crawl_delay = 3


class InvalidCrawlDelayTest(BaseRobotTest, unittest.TestCase):
    # From bug report #523041
    robots_txt = """\
User-Agent: *
Disallow: /.
Crawl-delay: pears
    """
    good = ['/foo.html']
    # bug report says "/" should be denied, but that is not in the RFC
    bad = []


class AnotherInvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    # also test that Allow and Disallow work well with each other
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
Request-rate: whale/banana
    """
    agent = 'Googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class UserAgentOrderingTest(BaseRobotTest, unittest.TestCase):
    # the order of User-agent entries should be respected.  Note that this
    # file is technically incorrect because "Googlebot" is a substring of
    # "Googlebot-Mobile"
    robots_txt = """\
User-agent: Googlebot
Disallow: /

User-agent: Googlebot-Mobile
Allow: /
    """
    agent = 'Googlebot'
    bad = ['/something.jpg']


class UserAgentGoogleMobileTest(UserAgentOrderingTest):
    agent = 'Googlebot-Mobile'


class GoogleURLOrderingTest(BaseRobotTest, unittest.TestCase):
    # Google also got the order wrong. You need
    # to specify the URLs from more specific to more general
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
    """
    agent = 'googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
    # see issue #6325 for details
    robots_txt = """\
User-agent: *
Disallow: /some/path?name=value
    """
    good = ['/some/path']
    bad = ['/some/path?name=value']


class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    # obey first * entry (#4108)
    robots_txt = """\
User-agent: *
Disallow: /some/path

User-agent: *
Disallow: /another/path
    """
    good = ['/another/path']
    bad = ['/some/path']


class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
    # normalize the URL first (#17403)
    robots_txt = """\
User-agent: *
Allow: /some/path?
Disallow: /another/path?
    """
    good = ['/some/path?']
    bad = ['/another/path?']


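# With only a default '*' entry, its Crawl-delay and Request-rate apply to
# every agent.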
class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/
    """
    request_rate = urllib.robotparser.RequestRate(3, 15)
    crawl_delay = 1
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html']


class StringFormattingTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow: /some/path
    """

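    # str() re-serializes the parsed entries: named agents come first, the
    # default '*' entry last, and trailing comments are dropped.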
    expected_output = """\
User-agent: cybermapper
Disallow: /some/path

User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/\
"""

    def test_string_formatting(self):
        self.assertEqual(str(self.parser), self.expected_output)


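# Handler that answers every GET with 403, simulating a robots.txt that
# sits behind authentication.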
class RobotHandler(BaseHTTPRequestHandler):

    def do_GET(self):
        self.send_error(403, "Forbidden access")

    def log_message(self, format, *args):
        pass


@unittest.skipUnless(
    support.has_socket_support,
    "Socket server requires working socket."
)
class PasswordProtectedSiteTestCase(unittest.TestCase):

    def setUp(self):
        # clear _opener global variable
        self.addCleanup(urllib.request.urlcleanup)

        self.server = HTTPServer((socket_helper.HOST, 0), RobotHandler)

        self.t = threading.Thread(
            name='HTTPServer serving',
            target=self.server.serve_forever,
            # Short poll interval to make the test finish quickly.
            # Time between requests is short enough that we won't wake
            # up spuriously too many times.
            kwargs={'poll_interval': 0.01})
        self.t.daemon = True  # In case this function raises.
        self.t.start()

    def tearDown(self):
        self.server.shutdown()
        self.t.join()
        self.server.server_close()

    @threading_helper.reap_threads
    def testPasswordProtectedSite(self):
        addr = self.server.server_address
        url = 'http://' + socket_helper.HOST + ':' + str(addr[1])
        robots_url = url + "/robots.txt"
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(url)
        parser.read()
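        # A 401/403 on the robots.txt fetch makes the parser disallow
        # everything.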
        self.assertFalse(parser.can_fetch("*", robots_url))


@support.requires_working_socket()
class NetworkTestCase(unittest.TestCase):

    base_url = 'http://www.pythontest.net/'
    robots_txt = '{}elsewhere/robots.txt'.format(base_url)

    @classmethod
    def setUpClass(cls):
        support.requires('network')
        with socket_helper.transient_internet(cls.base_url):
            cls.parser = urllib.robotparser.RobotFileParser(cls.robots_txt)
            cls.parser.read()

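    # Join 'path' onto base_url, adding a trailing slash when the path has
    # no file extension.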
    def url(self, path):
        return '{}{}{}'.format(
            self.base_url, path, '/' if not os.path.splitext(path)[1] else ''
        )

    def test_basic(self):
        self.assertFalse(self.parser.disallow_all)
        self.assertFalse(self.parser.allow_all)
        self.assertGreater(self.parser.mtime(), 0)
        self.assertFalse(self.parser.crawl_delay('*'))
        self.assertFalse(self.parser.request_rate('*'))

    def test_can_fetch(self):
        self.assertTrue(self.parser.can_fetch('*', self.url('elsewhere')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.base_url))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('brian')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('webstats')))
        self.assertFalse(self.parser.can_fetch('*', self.url('webstats')))
        self.assertTrue(self.parser.can_fetch('*', self.base_url))

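    # A 404 for robots.txt means everything is allowed and no metadata
    # (mtime, crawl delay, request rate) is recorded.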
    def test_read_404(self):
        parser = urllib.robotparser.RobotFileParser(self.url('i-robot.txt'))
        parser.read()
        self.assertTrue(parser.allow_all)
        self.assertFalse(parser.disallow_all)
        self.assertEqual(parser.mtime(), 0)
        self.assertIsNone(parser.crawl_delay('*'))
        self.assertIsNone(parser.request_rate('*'))


if __name__ == '__main__':
    unittest.main()