Source code for iocdoc.token_support

'''
Applies Python :mod:`tokenize` analysis to each line of a text file.
'''


import token
import tokenize
import utils
import text_file


PRINT_DIAGNOSTICS = False


class TokenLog():
    '''
    Applies the Python ``tokenize`` analysis to each line of a file.

    This allows a lexical analysis of the file, line-by-line.
    This is powerful and makes some complex analyses more simple,
    but it assumes the file resembles Python source code.

    :note: The ``tokenize`` analysis is not robust.
       Some files will cause exceptions for various reasons.

    :see: http://docs.python.org/library/tokenize.html
    :see: http://docs.python.org/library/token.html
    '''

    def __init__(self):
        '''constructor'''
        self.tokenList = []
        self.xref = {}
        self.nameTable = token.tok_name
        self.nameTable[tokenize.COMMENT] = 'COMMENT'
        self.nameTable[tokenize.NL] = 'NEWLINE'
        self.token_pointer = None
        if PRINT_DIAGNOSTICS:
            print "\n".join( sorted(self.nameTable.values()) )
    def get(self, index):
        '''retrieve the indexed token from the list'''
        return self.tokenList[index]
    def tokenName(self, tokType):
        '''convert token number to a useful string'''
        return self.nameTable[tokType]
    def tokenReceiver(self, tokType, tokStr, start, end, tokLine):
        '''called by the tokenize package, logs each token as it is reported'''
        tokName = self.tokenName(tokType)
        tok_dict = {
            'tokName': tokName,
            'tokType': tokType,
            'tokStr': tokStr,
            'start': start,
            'end': end,
            'tokLine': tokLine,
        }
        self.tokenList.append( tok_dict )
        if tokName not in self.xref:
            self.xref[tokName] = []
        self.xref[tokName].append( len(self.tokenList)-1 )
    def getTokenList(self):
        '''
        :return: list of token dictionaries
        '''
        return self.tokenList
    def getCrossReferences(self):
        '''
        :return: dictionary of token cross-references
        '''
        return self.xref
    # def report(self):
    #     '''
    #     prints (to stdout) results contained in tokenList list and xref dictionary
    #     '''
    #     print len(self.tokenList), "tokens were found"
    #     print len(self.xref), "different kinds of tokens were found"
    #     for k, v in self.xref.items():
    #         print k, len(v), "[",
    #         for index in v:
    #             report_dict = self.tokenList[index]
    #             print report_dict['start'][0],
    #         print "]"
    #     for k in ['OP', 'NAME', 'STRING']:
    #         if k in self.xref:
    #             for index in self.xref[k]:
    #                 report_dict = self.tokenList[index]
    #                 print k, report_dict['start'], "|" + report_dict['tokStr'].strip() + "|"
    #
    # def summary(self, alsoPrint = False):
    #     '''
    #     Summarizes the xref dictionary contents.
    #     Reports the number of each different token name (type).
    #
    #     :param alsoPrint: boolean to enable print to stdout
    #     :return: dictionary of token frequencies
    #     '''
    #     summary_dict = {k: len(v) for k, v in self.xref.items()}
    #     if alsoPrint:
    #         for k, v in sorted(summary_dict.items()):
    #             print "%s : %d" % (k, v)
    #     return summary_dict
    def processFile(self, filename):
        '''process just one file'''
        f = text_file.read(filename)    # use the file cache
        try:
            tokenize.tokenize(f.iterator().readline, self.tokenReceiver)
        except Exception, _exc:
            msg = 'trouble understanding: ' + f.absolute_filename
            msg += '\n' + str(_exc)
            utils.logMessage(msg, utils.LOGGING_DETAIL__NOISY)
            raise RuntimeError(msg)
        self.token_pointer = None
        utils.logMessage('tokenized file: ' + f.filename, utils.LOGGING_DETAIL__NOISY)
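    # Usage sketch (illustrative, not part of the original module); assumes a
    # hypothetical EPICS template file named "example.substitutions" exists:
    #
    #     log = TokenLog()
    #     log.processFile('example.substitutions')
    #     for tok in log.getTokenList()[:5]:
    #         print tok['tokName'], repr(tok['tokStr'])
    #
    # Each logged token is a dict with the keys:
    # tokName, tokType, tokStr, start, end, tokLine.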
    def lineAnalysis(self):
        '''
        analyze the tokens by line

        :return: dictionary with all the lines, including tokenized form
        '''
        # build a dictionary with all the lines, and a list of all the lines, in order
        lines = {'numbers': []}
        longest = len(self.tokenList)
        lastProgress = None
        for tok in self.tokenList:
            lineNum = tok['start'][0]
            progress = lineNum*100/longest
            if progress != lastProgress:
                lastProgress = progress
                #if (progress % 5) == 0:
                #    print "%3d%%" % progress
            if lineNum not in lines['numbers']:
                # remember the order the line numbers came in
                lines['numbers'].append( lineNum )
                # each line number has a dictionary with:
                #  - (string) full text of the line from f.readline()
                #  - (string) token pattern
                #  - (list) tuple: tokName, tokType, tokStr, start, end
                lines[lineNum] = {}
                lines[lineNum]['pattern'] = []
                lines[lineNum]['tokens'] = []
                lines[lineNum]['readline'] = tok['tokLine']
            # initially, pattern is a list of token names
            lines[lineNum]['pattern'].append( tok['tokName'] )
            item = {
                'tokName': tok['tokName'],
                'tokType': tok['tokType'],
                'tokStr': tok['tokStr'],
                'start': tok['start'],
                'end': tok['end'],
            }
            lines[lineNum]['tokens'].append( item )
        # change pattern from list to string
        for line in lines['numbers']:
            pat = lines[line]['pattern']
            lines[line]['pattern'] = " ".join( pat )
        # don't retain this list locally, just return it to the caller
        return lines
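    # Consumption sketch (illustrative, not from the original source): the
    # returned dictionary is keyed by line number, with lines['numbers']
    # preserving the order in which the lines were encountered:
    #
    #     lines = log.lineAnalysis()
    #     for num in lines['numbers']:
    #         print num, lines[num]['pattern']    # e.g. "NAME OP STRING NEWLINE"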
    def setTokenPointer(self, position = None):
        '''
        set the token pointer to the given position

        :param position: index position within list of tokens
        :raise Exception: token pointer position errors
        '''
        if position != None:
            if position < 0:
                # allow easy Pythonic reference to the last indices
                position = len(self.tokenList) + position
            if position < 0:
                raise Exception, "position cannot be a negative number"
            if position >= len(self.tokenList):
                raise Exception, "position cannot be greater than or equal to number of tokens"
            self.token_pointer = position
        return self.tokenList[position]
    def getCurrentToken(self):
        '''return the token at the current pointer position'''
        return self.tokenList[self.token_pointer]
    def next(self):
        '''
        return the next token or raise a StopIteration exception upon reaching the end of the sequence

        :return: token object
        :raise StopIteration: reached the end of the sequence
        '''
        if self.token_pointer == len(self.tokenList) - 1:
            raise StopIteration
        if self.token_pointer is None:
            self.token_pointer = -1
        self.token_pointer += 1
        return self.tokenList[self.token_pointer]
    def previous(self):
        '''
        return the previous token

        :return: token object
        :raise StopIteration: reached the beginning of the sequence
        '''
        if self.token_pointer == 0:
            raise StopIteration
        if self.token_pointer is None:
            self.token_pointer = 0
        self.token_pointer -= 1
        return self.tokenList[self.token_pointer]
    def nextActionable(self, skip_list=None):
        '''
        walk through the tokens and find the next actionable token

        :param [str] skip_list: list of token names to ignore;
            default list: ('COMMENT', 'NEWLINE', 'ENDMARKER', 'ERRORTOKEN', 'INDENT', 'DEDENT')
        :return: token object, or None if no more tokens
        '''
        # TODO: can this become an iterator?
        if skip_list is None:
            skip_these_tokens = '''
                COMMENT NEWLINE ENDMARKER ERRORTOKEN INDENT DEDENT
                '''.strip().split()
        else:
            skip_these_tokens = skip_list
        found = False
        while not found:
            try:
                tok = self.next()
            except StopIteration:
                return None
            if tok['tokName'] not in skip_these_tokens:
                found = True
        return tok
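    # Traversal sketch (illustrative, not from the original source): after
    # processFile() the pointer is reset, so repeated calls walk every
    # actionable token in order:
    #
    #     tok = log.nextActionable()
    #     while tok is not None:
    #         print token_key(tok)
    #         tok = log.nextActionable()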
    def _print_token_(self, tkn):
        '''developer use'''
        print '%3d,%3d' % tkn['start'],
        print '%10s' % tkn['tokName'],
        print '|%15s|' % tkn['tokStr'].strip(),
        print '|%s|' % tkn['tokLine'].strip()
    def tokens_to_list(self):
        '''
        parse an enclosed list of tokens into a list

        Assume ``token_pointer`` is pointing at the opening delimiter.

        examples::

            (DESC, "motor $(P)$(M)")    --> ['DESC', 'motor $(P)$(M)']
            {P, S, BL, T1, T2, A}       --> ['P', 'S', 'BL', 'T1', 'T2', 'A']
            {12ida1: A "##ID" 1 2 1}    --> ['12ida1:', 'A', '##ID', '1', '2', '1']

        TODO: alias($(IOC):IOC_CPU_LOAD,"$(IOC):load")
        '''
        # first, decide the list terminators
        tok = self.getCurrentToken()
        t_start = token_key(tok)
        if t_start not in ('OP (', 'OP {'):
            msg = 'incorrect token type'
            raise ValueError, msg
        t_end = {'OP (': 'OP )', 'OP {': 'OP }'}[t_start]

        #content_names = ('NAME', 'NUMBER', 'OP', 'STRING', 'ERRORTOKEN')
        skip_list = ('COMMENT', 'NEWLINE', 'ENDMARKER', #'ERRORTOKEN',
                     'INDENT', 'DEDENT')

        v = ''
        end = tok['start'][1]
        items = []
        depth = 1
        while depth > 0 or token_key(tok) not in ('', t_end):
            tok = self.nextActionable(skip_list)
            key = token_key(tok)
            if key == t_start:
                depth += 1
            elif key == t_end:
                depth -= 1
                if depth == 0:
                    break
            if tok['start'][1] == end and key != 'OP ,':
                v += tok['tokStr']
                end = tok['end'][1]
            else:
                if len(v) > 0:
                    v = utils.strip_quotes(v)
                    if len(v) == 0:
                        v = '""'
                    items.append(v)
                if key not in (t_end, 'OP ,'):
                    v = tok['tokStr']
                else:
                    v = ''
                end = tok['end'][1]
        if len(v) > 0:
            # last chance
            v = utils.strip_quotes(v)
            if len(v) == 0:
                v = '""'
            items.append(v)
        return items
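    # Usage sketch (illustrative assumption): call with the pointer resting on
    # the opening "(" or "{" token, for example right after nextActionable()
    # returned a token whose token_key() is 'OP (' or 'OP {':
    #
    #     if token_key(log.getCurrentToken()) in ('OP (', 'OP {'):
    #         print log.tokens_to_list()    # e.g. ['DESC', 'motor $(P)$(M)']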
    def getFullWord(self, skip_list=None):
        '''
        parse the token stream for a contiguous word and return it as str

        Some words in template files might not be enclosed in quotes
        and thus the whole word is broken into several tokens.
        This command rebuilds the word, without stripping quotes (if provided).
        '''
        tok = self.getCurrentToken()
        end = tok['start'][1]
        v = ''
        while tok is not None:
            if tok['start'][1] == end:
                v += tok['tokStr']
                end = tok['end'][1]
            else:
                break
            tok = self.nextActionable(skip_list)
        if v.endswith('{'):
            # moved from template.py
            # watch for patterns such as this: "../../33iddApp/Db/filterDrive.db"{
            v = v[:-1]
            tok = self.setTokenPointer(self.token_pointer-1)    # undo last nextActionable()
            while token_key(tok) != 'OP {':
                tok = self.setTokenPointer(self.token_pointer-1)    # back up
        return v
    def getKeyValueSet(self):
        '''
        parse a token sequence as a list of macro definitions into a dictionary

        example::

            { P=12ida1:,SCANREC=12ida1:scan1 }
            {P=12ida1:,SCANREC=12ida1:scan1,Q=m1,POS="$(Q).VAL",RDBK="$(Q).RBV"}
        '''
        # TODO: what about reset definitions?  {P=,SCANREC=}
        kv = {}
        for definition in self.tokens_to_list():
            k, v = [_.strip('"') for _ in definition.split('=')]
            kv[k.strip()] = v
        return kv
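    # Expected result sketch (illustrative, based on the first docstring
    # example above):
    #
    #     { P=12ida1:,SCANREC=12ida1:scan1 }
    #         --> {'P': '12ida1:', 'SCANREC': '12ida1:scan1'}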
def token_key(tkn):
    '''developer use, short string identifying the type and text of this token'''
    if tkn is None:
        m = ''
    else:
        m = tkn['tokName'] + ' ' + tkn['tokStr']
    return m
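# Examples (illustrative, not from the original source):
#
#     token_key({'tokName': 'OP', 'tokStr': '{'})    -->    'OP {'
#     token_key(None)                                -->    ''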
def parse_bracketed_macro_definitions(tokenLog):
    '''
    walk through a bracketed string, keeping track of delimiters

    verify we start on an opening delimiter
    '''
    analysis = _find_sections(tokenLog)

    token_dividers = [analysis['start'], analysis['end']]
    for key in 'commas equals'.split():
        token_dividers += analysis[key]
    token_dividers.sort()

    if len(analysis['commas']) == 0 and len(analysis['equals']) == 0:
        # No delimiters found: either no macro, 1 macro, or space-delimited.
        # Cannot become a dict since no "=" were found.
        # Look at all tokens between the enclosure,
        # accumulate contiguous text,
        # break on non-contiguous boundaries.
        # Note: makes no assumption about all on one line.
        s, f = token_dividers
        l, c = tokenLog.get(s)['end']
        text = ''
        parts = []
        for i in range(s+1, f):
            tok = tokenLog.get(i)
            if tok['start'][1] != c or tok['start'][0] != l:
                if len(text) > 0:
                    parts.append(text)
                text = tok['tokStr']
            else:
                text += tok['tokStr']
            l, c = tokenLog.get(i)['end']
        if len(text) > 0:
            parts.append(text)
        return parts

    text_list = []
    for index, key in enumerate(token_dividers[0:-1]):
        s = key+1
        f = token_dividers[index+1]
        text_list.append( _rebuild_text([tokenLog.get(_) for _ in range(s, f)]) )

    if len(analysis['commas']) > len(analysis['equals']):
        return text_list
    else:
        # tricky: http://stackoverflow.com/questions/6900955/python-convert-list-to-dictionary
        # if text_list = ['a', 'b', 'c', 'd']
        # this returns dict(a='b', c='d')
        return dict(zip(text_list[::2], text_list[1::2]))
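# Behavior sketch (illustrative, inferred from the code above): the return
# type depends on which delimiters dominate inside the brackets:
#
#     {P, S, BL}                          --> ['P', 'S', 'BL']            (commas only: list)
#     {P=12ida1:,SCANREC=12ida1:scan1}    --> {'P': '12ida1:',
#                                              'SCANREC': '12ida1:scan1'} (equals present: dict)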
def _find_sections(tokenLog):
    '''
    locate the tokens that divide this sequence into sections

    The overall section is delimited by {} or ().
    Internally, the delimiters are , or =.
    All the rest (that is not a comment) is string content to be kept.

    Return the sections as a dictionary with these members:

    * 'start': token number for the opening symbol
    * 'commas': list of token numbers for comma delimiters
    * 'equals': list of token numbers for equal sign delimiters
    * 'end': token number for the matching closing symbol
    '''
    terminator = {
        '{': 'OP }',
        '(': 'OP )',
    }

    tok = tokenLog.getCurrentToken()
    pt_start = tokenLog.token_pointer
    c = tok['tokStr']
    if c in terminator:
        tk_start = token_key(tok)
        tk_end = terminator[c]
        tok = tokenLog.nextActionable()
    else:
        tk_start, tk_end = None, None
        pt_start -= 1
        # l, c = tok['start']
        # msg = '(%d,%d) ' % (l, c+1)
        # msg += 'token stream not starting with "(" or "{"'
        # raise KeyError, msg

    depth = 1
    commas = []
    equals = []
    while depth > 0:
        tk = token_key(tok)
        if tk == 'OP ,' and depth == 1:
            commas.append(tokenLog.token_pointer)
        elif tk == 'OP =' and depth == 1:
            equals.append(tokenLog.token_pointer)
        elif tk == tk_start:
            depth += 1
        elif tk == tk_end or tok is None:
            depth -= 1
            if depth == 0:
                pt_end = tokenLog.token_pointer
                break
        tok = tokenLog.nextActionable()

    return dict(
        start = pt_start,
        end = pt_end,
        commas = commas,
        equals = equals,
    )


def _rebuild_text(token_list):
    '''
    reconstruct the text from the list of tokens
    '''
    text = ''
    for tok in token_list:
        # Q: What if tok['tokName'] is a COMMENT or other undesirable?
        # A: not common in macro definitions, fix code if this is seen
        # Q: what about line number or column number gaps between tokens?
        # A: addressed above, do not mix comma delimited and whitespace delimited
        text += tok['tokStr']
    return text
def reconstruct_line(tokens = [], firstIndex = 1, no_comments=True):
    '''
    reconstruct the line from the list of tokens presented

    :param [tok_dict] tokens: as used throughout this module
    :param int firstIndex: first index in tokens list to use
    :param bool no_comments: True (default) to stop reconstructing at the first comment token
    :return: reconstructed line
    '''
    cmd = ""
    for tkn in tokens[firstIndex:]:
        if tkn['tokName'] == 'COMMENT' and no_comments:
            break
        if tkn['tokName'] not in ('NEWLINE',):
            start = tkn['start'][1]
            cmd += " "*(start - len(cmd))
            cmd += tkn['tokStr']
    return cmd
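# Usage sketch (illustrative assumption): given the token dictionaries of a
# single line, e.g. lines[num]['tokens'] from TokenLog.lineAnalysis(),
# rebuild that line's text while dropping any trailing comment:
#
#     text = reconstruct_line(lines[num]['tokens'], firstIndex=0)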
######################################################################


def main():
    filename = __file__
    obj = TokenLog()
    obj.processFile(filename)
    # obj.summary(True)    # summary() is commented out in the class above
    analysis = obj.lineAnalysis()
    for number in analysis['numbers']:
        pattern = analysis[number]['pattern']
        print number
        if pattern not in ('NEWLINE', 'ENDMARKER', 'COMMENT NEWLINE', ):
            print number, pattern, analysis[number]['readline'].strip()
    for _i in range(5):
        print str(obj.nextActionable())
    obj.setTokenPointer(-10)
    tok = obj.nextActionable()
    while tok is not None:
        print str(tok)
        tok = obj.nextActionable()


if __name__ == '__main__':
    main()