#! /usr/bin/env python3

##############################################
# Extract symbol names from a .ind file.     #
#                                            #
# Author: Scott Pakin                        #
##############################################

import re
import sys
from collections import defaultdict

# Define regular expressions to pick out a symbol name (as invoked from a
# document) and implementation (which may be specific to the CLSL).
name_re = re.compile(r'\\(?:sub)?item (.+?)\s+\(')
impl_re = re.compile(r'\s+\(\$?([^\)$]+)\$?\)\\pfill')

# Define a regular expression to replace \spverb+ANYTHING+ with ANYTHING.
verb_re = re.compile(r'\\(?:sp)?verb\+([^+]+)\+')

# Define a regular expression to replace \blackacc{ANYTHING} (and similar
# constructs) with ANYTHING.
blackacc_re = re.compile(r'\\blackacc(?:hack|two)?\{([^\}]+)\}')

# (prefix, suffix) pairs that strip_wrappers() peels off an implementation
# string.  Order matters: the braced and longer-named forms must be tried
# before the bare "\blackacc" prefix, which would otherwise match them too.
# The "\mbox" entry includes its opening brace so that both braces are
# removed together; pairing a bare "\mbox" with a trailing "}" would leave
# an unbalanced "{" behind.
_WRAPPERS = (
    (r'\blackacchack{', '}'),
    (r'\blackacctwo{', '}'),
    (r'\blackacc{', '}'),
    (r'\blackacchack', ''),
    (r'\blackacctwo', ''),
    (r'\blackacc', ''),
    (r'\ensuremath{', '}'),
    (r'\mbox{', '}'),
    (r'\strut', ''),
    (r'\smash', ''),
    ('', r'\relax'),
    ('', r'\hspace{0.5em}'),
)

# Mapping from implementation to name.  main() rebinds this to the cleaned
# dictionary; impl_sort_key() falls back to it when no mapping is passed.
impl_to_name = defaultdict(lambda: None)


def strip_wrappers(s):
    """Remove uninteresting wrapper calls around an implementation string.

    Handles both the "\\blackacc{\\xyz}" and "\\blackacc\\xyz" formulations,
    removes "\\ensuremath{...}" and "\\mbox{...}", and removes the
    special-case prefixes and suffixes listed in _WRAPPERS.  Wrappers are
    peeled repeatedly until none of them matches, so nested wrappers are
    removed as well.  Returns the stripped string.
    """
    changed = True
    while changed:
        changed = False
        for prefix, suffix in _WRAPPERS:
            if s.startswith(prefix) and s.endswith(suffix):
                s = s[len(prefix):len(s) - len(suffix)].strip()
                changed = True
    return s


def extract_name(s):
    """Extract a control sequence (with optional arguments) from a string.

    Returns None for "\\href{" entries and for anything that, after
    unwrapping \\verb/\\spverb and dropping "\\-" and "\\linebreak[0]", is
    neither a single character nor a string starting with a backslash.
    """
    if s.startswith(r'\href{'):
        return None
    s = verb_re.sub(r'\1', s)
    s = s.replace(r'\-', '')
    s = s.replace(r'\linebreak[0]', '')
    s = s.strip()
    if s.startswith('\\') or len(s) == 1:
        return s
    return None


def contains_stray_spaces(s):
    """Return True if a string contains spaces outside of curly braces."""
    depth = 0
    for c in s:
        if c == ' ' and depth == 0:
            return True
        elif c == '{':
            depth += 1
        elif c == '}':
            depth -= 1
    return False


def clean_dict(i2n):
    """Replace names of None with a copy of the implementation.  Filter
    out non-control-sequences.

    An entry is kept only if its implementation or its name contains a
    backslash.  Returns a new plain dict; the argument is not modified.
    """
    # Note that this raw string is two backslashes followed by "texttt{".
    texttt_prefix = r'\\texttt{'
    clean = {}
    for k, v in i2n.items():
        if v is None:
            v = k
        if '\\' not in k and '\\' not in v:
            continue
        if v.startswith(texttt_prefix) and v.endswith('}'):
            v = v[len(texttt_prefix):-1]   # Special case for "|"
        clean[k] = v
    return clean


def impl_sort_key(k, mapping=None):
    """Return a key for sorting impl_to_name.

    Entries sort by name (falling back to the implementation when the name
    is empty or None), case-insensitively first and then case-sensitively,
    with the implementation as the tie-breaker.  If mapping is omitted, the
    module-level impl_to_name is consulted.
    """
    if mapping is None:
        mapping = impl_to_name
    v = mapping[k] or k
    return (v.lower(), v, k.lower(), k)


def collect_symbols(lines):
    """Build an implementation-to-name dict from the lines of a .ind file.

    Lines with no implementation, or whose implementation contains "\\,"
    or a space outside of curly braces (as in, e.g., "xyz package option"),
    are skipped.  The name is None if the line has no name or the name is
    not a control sequence.
    """
    mapping = {}
    for ln in lines:
        # Find the symbol implementation.
        match = impl_re.search(ln)
        if match is None:
            continue
        impl = strip_wrappers(match[1])
        if r'\,' in impl or contains_stray_spaces(impl):
            continue

        # Find the symbol name.
        name = None
        match = name_re.search(ln)
        if match is not None:
            name = extract_name(match[1])

        # Store the mapping.  Don't replace non-None with None, though.
        if mapping.get(impl) is None:
            mapping[impl] = name
    return mapping


def format_symbols(mapping):
    """Return the sorted output lines for a cleaned implementation-to-name
    dict.  Each line is the name, padded to the longest name, followed by
    a space and the implementation.  An empty dict yields an empty list.
    """
    # default=0 keeps an input with no usable symbols from raising
    # ValueError in max().
    max_name_len = max((len(v) for v in mapping.values()), default=0)
    impls = sorted(mapping, key=lambda k: impl_sort_key(k, mapping))
    return ['%-*.*s %s' % (max_name_len, max_name_len, mapping[k], k)
            for k in impls]


def main(argv=None):
    """Read the .ind file named by the first argument and print its symbols.

    argv defaults to sys.argv[1:]; arguments after the first are ignored.
    Returns 0 on success or 1 if no input file was given.
    """
    global impl_to_name
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        print('Usage: %s <file.ind>' % sys.argv[0], file=sys.stderr)
        return 1

    # Process the input file line-by-line.
    with open(args[0]) as r:
        raw = collect_symbols(r)

    # Sort and output the list of symbols.
    impl_to_name = clean_dict(raw)
    for line in format_symbols(impl_to_name):
        print(line)
    return 0


if __name__ == '__main__':
    sys.exit(main())