doc_postprocess.py

   1 #!/usr/bin/env python3
   2
   3 # doc_postprocess.py [-h|--help] <pattern>...
   4
   5 # Post-process the Doxygen-generated HTML files matching pattern.
   6
   7 import os
   8 import sys
   9 import re
  10 import glob
  11
  12 # Substitutions with regular expressions are somewhat slow in Python 3.9.5.
  13 # Use str.replace() rather than re.sub() where possible.
  14
  15 # [search string, compiled regular expression or None, substitution string, count]
  16 class_el_patterns = [
  17   # return value
  18   [ ' &amp;&nbsp;', re.compile(r' &amp;&nbsp; *'), '&amp;&#160;', 1],
  19   [ ' *&nbsp;', re.compile(r' \*&nbsp; *'), '*&#160;', 1],
  20   # parameters
  21   [ ' &amp;', None, '&amp;', 0],
  22   [ '&amp;', re.compile(r'&amp;\b'), '&amp; ', 0],
  23   [ ' *', None, '*', 0],
  24   [ '*', re.compile(r'\*\b'), '* ', 0],
  25   # templates
  26   [ 'template&lt;', re.compile(r'\btemplate&lt;'), 'template &lt;', 1]
  27 ]
  28
  29 class_md_patterns = [
  30   # left parenthesis
  31   [ '(&nbsp;', re.compile(r'\(&nbsp; *'), '(', 1],
  32   # return value
  33   [ ' &amp; ', None, '&amp; ', 0],
  34   [ ' * ', None, '* ', 0],
  35   # parameters
  36   [ ' &amp;&nbsp;', re.compile(r' &amp;&nbsp; *'), '&amp;&#160;', 0],
  37   [ ' *&nbsp;', re.compile(r' \*&nbsp; *'), '*&#160;', 0],
  38   # templates
  39   [ 'template&lt;', re.compile(r'\btemplate&lt;'), 'template &lt;', 1]
  40 ]
  41
  42 else_patterns = [
  43   # template decls
  44   [ 'template&lt;', re.compile(r'^(<h\d>|)template&lt;'), '\\1template &lt;', 1]
  45 ]
  46
  47 all_lines_patterns = [
  48   # For some reason, some versions of Doxygen output the full path to
  49   # referenced tag files. This is bad since it breaks doc_install.py,
  50   # and also because it leaks local path names into source tarballs.
  51   # Thus, strip the directory prefix here.
  52   [ ' doxygen="', re.compile(r' doxygen="[^":]*/([^":]+\.tag):'), ' doxygen="\\1:', 0],
  53
  54   [ '&copy;', None, '&#169;', 0],
  55   [ '&mdash;', None, '&#8212;', 0],
  56   [ '&ndash;', None, '&#8211;', 0],
  57   [ '&nbsp;', re.compile(r' *&nbsp; *'), '&#160;', 0]
  58 ]
  59
  60 def doc_postprocess(patterns):
  61   if not (isinstance(patterns, list) or isinstance(patterns, tuple)):
  62     patterns = [] if patterns == None else [patterns]
  63
  64   filepaths = []
  65   for pattern in patterns:
  66     filepaths += glob.glob(pattern)
  67
  68   for filepath in filepaths:
  69     # Assume that the file is UTF-8 encoded.
  70     # If illegal UTF-8 bytes in the range 0x80..0xff are encountered, they are
  71     # replaced by Unicode Private Use characters in the range 0xdc80..0xdcff
  72     # and restored to their original values when the file is rewritten.
  73     with open(filepath, mode='r', encoding='utf-8', errors='surrogateescape') as file:
  74       # Read the whole file into a buffer, a list with one line per element.
  75       buf = file.readlines()
  76
  77     for line_number in range(len(buf)):
  78       line = buf[line_number]
  79
  80       # Substitute
  81       if '<a class="el"' in line:
  82         for subst in class_el_patterns:
  83           if subst[0] in line:
  84             if subst[1]:
  85               line = subst[1].sub(subst[2], line, count=subst[3])
  86             else:
  87               line = line.replace(subst[0], subst[2], subst[3])
  88
  89       elif ('<td class="md"' in line) or ('<td class="mdname"' in line):
  90         for subst in class_md_patterns:
  91           if subst[0] in line:
  92             if subst[1]:
  93               line = subst[1].sub(subst[2], line, count=subst[3])
  94             else:
  95               line = line.replace(subst[0], subst[2], subst[3])
  96
  97       else:
  98         for subst in else_patterns:
  99           if subst[0] in line:
 100             if subst[1]:
 101               line = subst[1].sub(subst[2], line, count=subst[3])
 102             else:
 103               line = line.replace(subst[0], subst[2], subst[3])
 104
 105       for subst in all_lines_patterns:
 106         if subst[0] in line:
 107           if subst[1]:
 108             line = subst[1].sub(subst[2], line, count=subst[3])
 109           else:
 110             line = line.replace(subst[0], subst[2], subst[3])
 111
 112       buf[line_number] = line
 113
 114     with open(filepath, mode='w', encoding='utf-8', errors='surrogateescape') as file:
 115       # Write the whole buffer back into the file.
 116       file.writelines(buf)
 117
 118   return 0
 119
 120 # ----- Main -----
 121 if __name__ == '__main__':
 122   import argparse
 123
 124   parser = argparse.ArgumentParser(
 125     description='Post-process the Doxygen-generated HTML files matching pattern.')
 126   parser.add_argument('patterns', nargs='*', metavar='pattern', help='filename pattern')
 127   args = parser.parse_args()
 128   print(args.patterns)
 129
 130   sys.exit(doc_postprocess(args.patterns))