Python-3.11.7/Lib/urllib/robotparser.py
       1  """ robotparser.py
       2  
       3      Copyright (C) 2000  Bastian Kleineidam
       4  
       5      You can choose between two licenses when using this package:
       6      1) GNU GPLv2
       7      2) PSF license for Python 2.2
       8  
       9      The robots.txt Exclusion Protocol is implemented as specified in
      10      http://www.robotstxt.org/norobots-rfc.txt
      11  """
      12  
      13  import collections
      14  import urllib.parse
      15  import urllib.request
      16  
      17  __all__ = ["RobotFileParser"]
      18  
      19  RequestRate = collections.namedtuple("RequestRate", "requests seconds")
      20  
      21  
      22  class ESC[4;38;5;81mRobotFileParser:
      23      """ This class provides a set of methods to read, parse and answer
      24      questions about a single robots.txt file.
      25  
      26      """
      27  
      28      def __init__(self, url=''):
      29          self.entries = []
      30          self.sitemaps = []
      31          self.default_entry = None
      32          self.disallow_all = False
      33          self.allow_all = False
      34          self.set_url(url)
      35          self.last_checked = 0
      36  
      37      def mtime(self):
      38          """Returns the time the robots.txt file was last fetched.
      39  
      40          This is useful for long-running web spiders that need to
      41          check for new robots.txt files periodically.
      42  
      43          """
      44          return self.last_checked
      45  
      46      def modified(self):
      47          """Sets the time the robots.txt file was last fetched to the
      48          current time.
      49  
      50          """
      51          import time
      52          self.last_checked = time.time()
      53  
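    # Illustrative sketch (not part of the original module): a long-running
    # crawler can use mtime() to decide when to re-fetch robots.txt.  The
    # 24-hour threshold and the "rp" instance are assumptions for the example.
    #
    #     import time
    #     if time.time() - rp.mtime() > 24 * 3600:
    #         rp.read()    # on success, read() calls parse(), which calls
    #                      # modified() and so refreshes mtime()
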
    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urllib.parse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        try:
            f = urllib.request.urlopen(self.url)
        except urllib.error.HTTPError as err:
            if err.code in (401, 403):
                self.disallow_all = True
            elif err.code >= 400 and err.code < 500:
                self.allow_all = True
        else:
            raw = f.read()
            self.parse(raw.decode("utf-8").splitlines())

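    # Illustrative sketch (not part of the original module): typical use of
    # read() and the error handling above.  The URL is made up.
    #
    #     rp = RobotFileParser("https://www.example.com/robots.txt")
    #     rp.read()
    #     # HTTP 401/403 -> rp.disallow_all is True (everything is blocked)
    #     # other 4xx    -> rp.allow_all is True (everything is allowed)
    #     # success      -> the body is decoded as UTF-8 and passed to parse()
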
    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        We allow that a user-agent: line is not preceded by
        one or more blank lines.
        """
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        entry = Entry()

        self.modified()
        for line in lines:
            if not line:
                if state == 1:
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.parse.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
                elif line[0] == "crawl-delay":
                    if state != 0:
                        # before trying to convert to int we need to make
                        # sure that robots.txt has valid syntax otherwise
                        # it will crash
                        if line[1].strip().isdigit():
                            entry.delay = int(line[1])
                        state = 2
                elif line[0] == "request-rate":
                    if state != 0:
                        numbers = line[1].split('/')
                        # check if all values are sane
                        if (len(numbers) == 2 and numbers[0].strip().isdigit()
                            and numbers[1].strip().isdigit()):
                            entry.req_rate = RequestRate(int(numbers[0]), int(numbers[1]))
                        state = 2
                elif line[0] == "sitemap":
                    # According to http://www.sitemaps.org/protocol.html
                    # "This directive is independent of the user-agent line,
                    #  so it doesn't matter where you place it in your file."
                    # Therefore we do not change the state of the parser.
                    self.sitemaps.append(line[1])
        if state == 2:
            self._add_entry(entry)

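    # Illustrative sketch (not part of the original module): feeding parse()
    # a hand-written robots.txt; the agent names and paths are made up.  The
    # "FigTree" group becomes an Entry in self.entries, while the "*" group
    # is stored as self.default_entry by _add_entry().
    #
    #     rp = RobotFileParser()
    #     rp.parse([
    #         "User-agent: FigTree",
    #         "Disallow: /tmp",
    #         "",
    #         "User-agent: *",
    #         "Disallow: /cgi-bin",
    #     ])
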
    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # Until the robots.txt file has been read or found not
        # to exist, we must assume that no url is allowable.
        # This prevents false positives when a user erroneously
        # calls can_fetch() before calling read().
        if not self.last_checked:
            return False
        # search for given user agent matches
        # the first match counts
        parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
        url = urllib.parse.urlunparse(('','',parsed_url.path,
            parsed_url.params,parsed_url.query, parsed_url.fragment))
        url = urllib.parse.quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

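    # Illustrative sketch (not part of the original module): querying the
    # parser built in the sketch after parse() above.  RuleLine matching is a
    # prefix test, so /tmp and everything under it is refused for FigTree.
    #
    #     rp.can_fetch("FigTree/1.0", "/tmp/index.html")   # -> False
    #     rp.can_fetch("FigTree/1.0", "/docs/")            # -> True
    #     rp.can_fetch("OtherBot", "/cgi-bin/x")           # -> False (default "*" group)
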
    def crawl_delay(self, useragent):
        if not self.mtime():
            return None
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.delay
        if self.default_entry:
            return self.default_entry.delay
        return None

    def request_rate(self, useragent):
        if not self.mtime():
            return None
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.req_rate
        if self.default_entry:
            return self.default_entry.req_rate
        return None

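    # Illustrative sketch (not part of the original module): after parsing a
    # made-up group such as
    #
    #     User-agent: *
    #     Crawl-delay: 5
    #     Request-rate: 3/20
    #
    # the accessors above return the raw directive values:
    #
    #     rp.crawl_delay("FigTree/1.0")     # -> 5
    #     rp.request_rate("FigTree/1.0")    # -> RequestRate(requests=3, seconds=20)
    #     # both return None before read()/parse() has been called
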
    def site_maps(self):
        if not self.sitemaps:
            return None
        return self.sitemaps

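    # Illustrative sketch (not part of the original module): "Sitemap:" lines
    # are collected independently of any user-agent group, e.g.
    #
    #     rp.parse(["Sitemap: https://www.example.com/sitemap.xml"])
    #     rp.site_maps()    # -> ['https://www.example.com/sitemap.xml']
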
    def __str__(self):
        entries = self.entries
        if self.default_entry is not None:
            entries = entries + [self.default_entry]
        return '\n\n'.join(map(str, entries))


class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
       (allowance==False) followed by a path."""
    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        path = urllib.parse.urlunparse(urllib.parse.urlparse(path))
        self.path = urllib.parse.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return ("Allow" if self.allowance else "Disallow") + ": " + self.path

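# Illustrative sketch (not part of the original module): RuleLine matching is
# a plain prefix test on the percent-encoded path, e.g.
#
#     RuleLine("/tmp", False).applies_to("/tmp/a.html")   # -> True
#     RuleLine("/tmp", False).applies_to("/temp.html")    # -> False
#     RuleLine("", False).allowance                       # -> True (empty Disallow allows all)
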
class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""
    def __init__(self):
        self.useragents = []
        self.rulelines = []
        self.delay = None
        self.req_rate = None

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.append(f"User-agent: {agent}")
        if self.delay is not None:
            ret.append(f"Crawl-delay: {self.delay}")
        if self.req_rate is not None:
            rate = self.req_rate
            ret.append(f"Request-rate: {rate.requests}/{rate.seconds}")
        ret.extend(map(str, self.rulelines))
        return '\n'.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True
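

# Illustrative end-to-end sketch (not part of the original module); the host,
# paths and agent name are made up.  Entry.applies_to() lower-cases the name
# token of the caller's user agent ("examplebot" for "ExampleBot/2.1") and a
# group applies when its User-agent value is a substring of that token.
#
#     rp = RobotFileParser("https://www.example.com/robots.txt")
#     rp.read()
#     if rp.can_fetch("ExampleBot/2.1", "https://www.example.com/private/x"):
#         ...  # fetch the page, honouring rp.crawl_delay("ExampleBot/2.1")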