1 """
2 Try to detect suspicious constructs, resembling markup
3 that has leaked into the final output.
4
5 Suspicious lines are reported in a comma-separated-file,
6 ``suspicious.csv``, located in the output directory.
7
8 The file is utf-8 encoded, and each line contains four fields:
9
10 * document name (normalized)
11 * line number in the source document
12 * problematic text
13 * complete line showing the problematic text in context
14
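For example, an entry might look like this (illustrative values, not
taken from a real run):

  library/example,42,:meth,use :meth:`close` to release the resource
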
It is common to find many false positives. To avoid reporting them
again and again, they may be added to the ``ignored.csv`` file
(located in the configuration directory). The file has the same
format as ``suspicious.csv`` with a few differences:

 - each line defines a rule; if the rule matches, the issue
   is ignored.
 - line number may be empty (that is, nothing between the
   commas: ",,"). In this case, line numbers are ignored (the
   rule matches anywhere in the file).
 - the last field does not have to be a complete line; some
   surrounding text (never more than a line) is enough for
   context.
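For instance, the following rule (illustrative only) ignores the
``:meth`` fragment wherever it appears in ``library/example``, because
the line number field is empty:

  library/example,,:meth,use :meth:`close` to release the resource
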

Rules are processed sequentially. A rule matches when:

 * document names are the same
 * problematic texts are the same
 * line numbers are close to each other (5 lines up or down)
 * the rule text is completely contained in the source line

The simplest way to create the ignored.csv file is by copying
undesired entries from suspicious.csv (possibly trimming the last
field).

Copyright 2009 Gabriel A. Genellina

"""

import os
import re
import csv

from docutils import nodes
from sphinx.builders import Builder
import sphinx.util.logging

detect_all = re.compile(r'''
    ::(?=[^=])|            # two :: (but NOT ::=)
    :[a-zA-Z][a-zA-Z0-9]+| # :foo
    `|                     # ` (seldom used by itself)
    (?<!\.)\.\.[ \t]*\w+:  # .. foo: (but NOT ... else:)
    ''', re.VERBOSE).finditer
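# Illustrative fragments the pattern above flags: "::" left over from a
# literal block, a role prefix such as ":meth" in running text, a stray
# backquote "`", or directive-like text such as ".. note:".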


class Rule:
    def __init__(self, docname, lineno, issue, line):
        """A rule for ignoring issues"""
        self.docname = docname  # document to which this rule applies
        self.lineno = lineno    # line number in the original source;
                                # this rule matches only near that.
                                # None -> don't care
        self.issue = issue      # the markup fragment that triggered this rule
        self.line = line        # text of the container element (single line only)
        self.used = False

    def __repr__(self):
        return '{0.docname},,{0.issue},{0.line}'.format(self)


class dialect(csv.excel):
    """Our dialect: uses only linefeed as newline."""
    lineterminator = '\n'


class CheckSuspiciousMarkupBuilder(Builder):
    """
    Checks for possibly invalid markup that may leak into the output.
    """
    name = 'suspicious'
    logger = sphinx.util.logging.getLogger("CheckSuspiciousMarkupBuilder")

    def init(self):
        # create output file
        self.log_file_name = os.path.join(self.outdir, 'suspicious.csv')
        open(self.log_file_name, 'w').close()
        # load database of previously ignored issues
        self.load_rules(os.path.join(os.path.dirname(__file__), '..',
                                     'susp-ignored.csv'))

    def get_outdated_docs(self):
        return self.env.found_docs

    def get_target_uri(self, docname, typ=None):
        return ''

    def prepare_writing(self, docnames):
        pass

    def write_doc(self, docname, doctree):
        # set when any issue is encountered in this document
        self.any_issue = False
        self.docname = docname
        visitor = SuspiciousVisitor(doctree, self)
        doctree.walk(visitor)

    def finish(self):
        unused_rules = [rule for rule in self.rules if not rule.used]
        if unused_rules:
            self.logger.warning(
                'Found %s/%s unused rules: %s' % (
                    len(unused_rules), len(self.rules),
                    '\n'.join(repr(rule) for rule in unused_rules),
                )
            )
        return

    def check_issue(self, line, lineno, issue):
        if not self.is_ignored(line, lineno, issue):
            self.report_issue(line, lineno, issue)

    def is_ignored(self, line, lineno, issue):
        """Determine whether this issue should be ignored."""
        docname = self.docname
        for rule in self.rules:
            if rule.docname != docname: continue
            if rule.issue != issue: continue
            # Exact line matching would be too strict, and fuzzy matching
            # (e.g. Levenshtein distance) would require extra dependencies,
            # so just check whether the rule fragment is contained in the
            # document line.
            if rule.line not in line: continue
            # Check both line numbers. If they are "near" each other,
            # this rule matches. (lineno=None means "don't care".)
            if (rule.lineno is not None) and \
                    abs(rule.lineno - lineno) > 5: continue
            # if it came this far, the rule matched
            rule.used = True
            return True
        return False

    def report_issue(self, text, lineno, issue):
        self.any_issue = True
        self.write_log_entry(lineno, issue, text)
        self.logger.warning('[%s:%d] "%s" found in "%-.120s"' %
                            (self.docname, lineno, issue, text))
        self.app.statuscode = 1

    def write_log_entry(self, lineno, issue, text):
        with open(self.log_file_name, 'a', encoding='utf-8') as f:
            writer = csv.writer(f, dialect)
            writer.writerow([self.docname, lineno, issue, text.strip()])

    def load_rules(self, filename):
        """Load database of previously ignored issues.

        A csv file, with exactly the same format as suspicious.csv
        Fields: document name (normalized), line number, issue, surrounding text
        """
        self.logger.info("loading ignore rules... ", nonl=1)
        self.rules = rules = []
        try:
            f = open(filename, 'r', encoding='utf-8')
        except IOError:
            return
        for i, row in enumerate(csv.reader(f)):
            if len(row) != 4:
                raise ValueError(
                    "wrong format in %s, line %d: %s" % (filename, i+1, row))
            docname, lineno, issue, text = row
            if lineno:
                lineno = int(lineno)
            else:
                lineno = None
            rule = Rule(docname, lineno, issue, text)
            rules.append(rule)
        f.close()
        self.logger.info('done, %d rules loaded' % len(self.rules))


def get_lineno(node):
    """Obtain line number information for a node."""
    lineno = None
    while lineno is None and node:
        node = node.parent
        lineno = node.line
    return lineno


def extract_line(text, index):
    r"""text may be a multiline string; extract
    only the line containing the given character index.

    >>> extract_line("abc\ndefgh\ni", 6)
    'defgh'
    >>> for i in (0, 2, 3, 4, 10):
    ...     print(extract_line("abc\ndefgh\ni", i))
    abc
    abc
    abc
    defgh
    i
    """
    p = text.rfind('\n', 0, index) + 1
    q = text.find('\n', index)
    if q < 0:
        q = len(text)
    return text[p:q]


class SuspiciousVisitor(nodes.GenericNodeVisitor):

    lastlineno = 0

    def __init__(self, document, builder):
        nodes.GenericNodeVisitor.__init__(self, document)
        self.builder = builder

    def default_visit(self, node):
        if isinstance(node, (nodes.Text, nodes.image)):  # direct text containers
            text = node.astext()
            # lineno seems to go backwards sometimes (?)
            self.lastlineno = lineno = max(get_lineno(node) or 0, self.lastlineno)
            seen = set()  # don't report the same issue more than once per line
            for match in detect_all(text):
                issue = match.group()
                line = extract_line(text, match.start())
                if (issue, line) not in seen:
                    self.builder.check_issue(line, lineno, issue)
                    seen.add((issue, line))

    unknown_visit = default_visit

    def visit_document(self, node):
        self.lastlineno = 0

    def visit_comment(self, node):
        # ignore comments -- too many false positives.
        # (although doing this could miss some errors;
        # there were two sections "commented-out" by mistake
        # in the Python docs that would not be caught)
        raise nodes.SkipNode
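

# A minimal registration sketch (an assumption about how this builder is
# wired into Sphinx; the actual project may hook it up differently):
#
#     def setup(app):
#         app.add_builder(CheckSuspiciousMarkupBuilder)
#         return {'parallel_read_safe': False}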