1 """
2 Try to detect suspicious constructs, resembling markup
3 that has leaked into the final output.
4
5 Suspicious lines are reported in a comma-separated-file,
6 ``suspicious.csv``, located in the output directory.
7
8 The file is utf-8 encoded, and each line contains four fields:
9
10 * document name (normalized)
11 * line number in the source document
12 * problematic text
13 * complete line showing the problematic text in context
14
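For example, an entry might look like this (illustrative values, not
taken from a real run):

  library/example,42,:meth,use :meth:`close` to release the resource
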
It is common to find many false positives. To avoid reporting them
again and again, they may be added to the ``ignored.csv`` file
(located in the configuration directory). The file has the same
format as ``suspicious.csv`` with a few differences:

 - each line defines a rule; if the rule matches, the issue
   is ignored.
 - line number may be empty (that is, nothing between the
   commas: ",,"). In this case, line numbers are ignored (the
   rule matches anywhere in the file).
 - the last field does not have to be a complete line; some
   surrounding text (never more than a line) is enough for
   context.
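For instance, the following rule (illustrative only) ignores the
``:meth`` fragment wherever it appears in ``library/example``, because
the line number field is empty:

  library/example,,:meth,use :meth:`close` to release the resource
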

Rules are processed sequentially. A rule matches when:

 * document names are the same
 * problematic texts are the same
 * line numbers are close to each other (5 lines up or down)
 * the rule text is completely contained in the source line

The simplest way to create the ignored.csv file is by copying
undesired entries from suspicious.csv (possibly trimming the last
field).

Copyright 2009 Gabriel A. Genellina

"""

import os
import re
import csv

from docutils import nodes
from sphinx.builders import Builder
import sphinx.util.logging

detect_all = re.compile(r'''
    ::(?=[^=])|            # two :: (but NOT ::=)
    :[a-zA-Z][a-zA-Z0-9]+| # :foo
    `|                     # ` (seldom used by itself)
    (?<!\.)\.\.[ \t]*\w+:  # .. foo: (but NOT ... else:)
    ''', re.VERBOSE).finditer
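# Illustrative fragments the pattern above flags: "::" left over from a
# literal block, a role prefix such as ":meth" in running text, a stray
# backquote "`", or directive-like text such as ".. note:".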


class Rule:
    def __init__(self, docname, lineno, issue, line):
        """A rule for ignoring issues"""
        self.docname = docname  # document to which this rule applies
        self.lineno = lineno    # line number in the original source;
                                # this rule matches only near that.
                                # None -> don't care
        self.issue = issue      # the markup fragment that triggered this rule
        self.line = line        # text of the container element (single line only)
        self.used = False

    def __repr__(self):
        return '{0.docname},,{0.issue},{0.line}'.format(self)


class dialect(csv.excel):
    """Our dialect: uses only linefeed as newline."""
    lineterminator = '\n'


class CheckSuspiciousMarkupBuilder(Builder):
    """
    Checks for possibly invalid markup that may leak into the output.
    """
    name = 'suspicious'
    logger = sphinx.util.logging.getLogger("CheckSuspiciousMarkupBuilder")

    def init(self):
        # create output file
        self.log_file_name = os.path.join(self.outdir, 'suspicious.csv')
        open(self.log_file_name, 'w').close()
        # load database of previously ignored issues
        self.load_rules(os.path.join(os.path.dirname(__file__), '..',
                                     'susp-ignored.csv'))

    def get_outdated_docs(self):
        return self.env.found_docs

    def get_target_uri(self, docname, typ=None):
        return ''

    def prepare_writing(self, docnames):
        pass

    def write_doc(self, docname, doctree):
        # set when any issue is encountered in this document
        self.any_issue = False
        self.docname = docname
        visitor = SuspiciousVisitor(doctree, self)
        doctree.walk(visitor)

    def finish(self):
        unused_rules = [rule for rule in self.rules if not rule.used]
        if unused_rules:
            self.logger.warning(
                'Found %s/%s unused rules: %s' % (
                    len(unused_rules), len(self.rules),
                    '\n'.join(repr(rule) for rule in unused_rules),
                )
            )
        return

    def check_issue(self, line, lineno, issue):
        if not self.is_ignored(line, lineno, issue):
            self.report_issue(line, lineno, issue)

    def is_ignored(self, line, lineno, issue):
        """Determine whether this issue should be ignored."""
        docname = self.docname
        for rule in self.rules:
            if rule.docname != docname: continue
            if rule.issue != issue: continue
            # Exact line matching would be too strict, and fuzzy matching
            # (e.g. Levenshtein distance) would require extra dependencies,
            # so just check whether the rule fragment is contained in the
            # document line.
            if rule.line not in line: continue
            # Check both line numbers. If they are "near" each other,
            # this rule matches. (lineno=None means "don't care".)
            if (rule.lineno is not None) and \
                    abs(rule.lineno - lineno) > 5: continue
            # if it came this far, the rule matched
            rule.used = True
            return True
        return False

    def report_issue(self, text, lineno, issue):
        self.any_issue = True
        self.write_log_entry(lineno, issue, text)
        self.logger.warning('[%s:%d] "%s" found in "%-.120s"' %
                            (self.docname, lineno, issue, text))
        self.app.statuscode = 1

    def write_log_entry(self, lineno, issue, text):
        with open(self.log_file_name, 'a', encoding='utf-8') as f:
            writer = csv.writer(f, dialect)
            writer.writerow([self.docname, lineno, issue, text.strip()])

    def load_rules(self, filename):
        """Load database of previously ignored issues.

        A csv file, with exactly the same format as suspicious.csv
        Fields: document name (normalized), line number, issue, surrounding text
        """
        self.logger.info("loading ignore rules... ", nonl=1)
        self.rules = rules = []
        try:
            f = open(filename, 'r', encoding='utf-8')
        except IOError:
            return
        for i, row in enumerate(csv.reader(f)):
            if len(row) != 4:
                raise ValueError(
                    "wrong format in %s, line %d: %s" % (filename, i+1, row))
            docname, lineno, issue, text = row
            if lineno:
                lineno = int(lineno)
            else:
                lineno = None
            rule = Rule(docname, lineno, issue, text)
            rules.append(rule)
        f.close()
        self.logger.info('done, %d rules loaded' % len(self.rules))


def get_lineno(node):
    """Obtain line number information for a node."""
    lineno = None
    while lineno is None and node:
        node = node.parent
        lineno = node.line
    return lineno


def extract_line(text, index):
    r"""text may be a multiline string; extract
    only the line containing the given character index.

    >>> extract_line("abc\ndefgh\ni", 6)
    'defgh'
    >>> for i in (0, 2, 3, 4, 10):
    ...     print(extract_line("abc\ndefgh\ni", i))
    abc
    abc
    abc
    defgh
    i
    """
    p = text.rfind('\n', 0, index) + 1
    q = text.find('\n', index)
    if q < 0:
        q = len(text)
    return text[p:q]


class SuspiciousVisitor(nodes.GenericNodeVisitor):

    lastlineno = 0

    def __init__(self, document, builder):
        nodes.GenericNodeVisitor.__init__(self, document)
        self.builder = builder

    def default_visit(self, node):
        if isinstance(node, (nodes.Text, nodes.image)):  # direct text containers
            text = node.astext()
            # lineno seems to go backwards sometimes (?)
            self.lastlineno = lineno = max(get_lineno(node) or 0, self.lastlineno)
            seen = set()  # don't report the same issue more than once per line
            for match in detect_all(text):
                issue = match.group()
                line = extract_line(text, match.start())
                if (issue, line) not in seen:
                    self.builder.check_issue(line, lineno, issue)
                    seen.add((issue, line))

    unknown_visit = default_visit

    def visit_document(self, node):
        self.lastlineno = 0

    def visit_comment(self, node):
        # ignore comments -- too many false positives.
        # (although doing this could miss some errors;
        # there were two sections "commented-out" by mistake
        # in the Python docs that would not be caught)
        raise nodes.SkipNode
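

# A minimal registration sketch (an assumption about how this builder is
# wired into Sphinx; the actual project may hook it up differently):
#
#     def setup(app):
#         app.add_builder(CheckSuspiciousMarkupBuilder)
#         return {'parallel_read_safe': False}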