# -*- coding: UTF-8 -*- # /** # * Software Name : pycrate # * Version : 0.4 # * # * Copyright 2016. Benoit Michau. ANSSI. # * # * This library is free software; you can redistribute it and/or # * modify it under the terms of the GNU Lesser General Public # * License as published by the Free Software Foundation; either # * version 2.1 of the License, or (at your option) any later version. # * # * This library is distributed in the hope that it will be useful, # * but WITHOUT ANY WARRANTY; without even the implied warranty of # * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # * Lesser General Public License for more details. # * # * You should have received a copy of the GNU Lesser General Public # * License along with this library; if not, write to the Free Software # * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, # * MA 02110-1301 USA # * # *-------------------------------------------------------- # * File Name : pycrate_asn1c/utils.py # * Created : 2016-03-02 # * Authors : Benoit Michau # *-------------------------------------------------------- # */ import re import pprint from keyword import iskeyword # pycrate_core is used only for basic library-wide functions / variables: # log(), python_version, integer_types, str_types from pycrate_core.utils import * from .err import ASN1Err # ------------------------------------------------------------------------------# # asn1-wide Python routines # ------------------------------------------------------------------------------# def asnlog(msg): """ customizable logging function for the whole asn1 part """ log(msg) _PP = pprint.PrettyPrinter(indent=1, width=80, depth=None, stream=None) def pprint(obj): return _PP.pprint(obj) def pformat(obj): return _PP.pformat(obj) # ------------------------------------------------------------------------------# # asn1-wide Python variables and identifiers # ------------------------------------------------------------------------------# # list of ASN.1 OIDs required to be "known" by the compiler ASN1_OID_ISO = { ('itu-t',): 0, ('ccitt',): 0, (0, 'recommendation'): 0, (0, 0, 'a'): 1, (0, 0, 'b'): 2, (0, 0, 'c'): 3, (0, 0, 'd'): 4, (0, 0, 'e'): 5, (0, 0, 'f'): 6, (0, 0, 'g'): 7, (0, 0, 'h'): 8, (0, 0, 'i'): 9, (0, 0, 'j'): 10, (0, 0, 'k'): 11, (0, 0, 'l'): 12, (0, 0, 'm'): 13, (0, 0, 'n'): 14, (0, 0, 'o'): 15, (0, 0, 'p'): 16, (0, 0, 'q'): 17, (0, 0, 'r'): 18, (0, 0, 's'): 19, (0, 0, 't'): 20, (0, 0, 'u'): 21, (0, 0, 'v'): 22, (0, 0, 'w'): 23, (0, 0, 'x'): 24, (0, 0, 'y'): 25, (0, 0, 'z'): 26, (0, 'question'): 1, (0, 'administration'): 2, (0, 'network-operator'): 3, (0, 'identified-organization'): 4, ('iso',): 1, (1, 'standard'): 0, (1, 'registration-authority'): 1, (1, 'member-body'): 2, (1, 2, 'f'): 250, (1, 'identified-organization'): 3, ('joint-iso-itu-t',): 2, ('joint-iso-ccitt',): 2, (2, 'asn1'): 1, (2, 1, 'basic-encoding'): 1, (2, 1, 'ber-derived'): 2, (2, 1, 'packed-encoding'): 3, (2, 'mhs-motif'): 6, (2, 'ms'): 9, (2, 'registration-procedures'): 17, } ### # DO NOT CHANGE the following identifiers # as many of them correspond directly to the ASN.1 syntax ### # ASN.1 object mode MODE_VALUE = 'VALUE' MODE_SET = 'SET' MODE_TYPE = 'TYPE' # ASN.1 type identifiers # basic types TYPE_NULL = 'NULL' TYPE_BOOL = 'BOOLEAN' TYPE_INT = 'INTEGER' TYPE_REAL = 'REAL' TYPE_ENUM = 'ENUMERATED' TYPE_BIT_STR = 'BIT STRING' TYPE_OCT_STR = 'OCTET STRING' TYPE_OID = 'OBJECT IDENTIFIER' TYPE_REL_OID = 'RELATIVE-OID' # string types TYPE_STR_IA5 = 'IA5String' TYPE_STR_PRINT = 'PrintableString' TYPE_STR_NUM = 'NumericString' TYPE_STR_VIS = 'VisibleString' TYPE_STR_BMP = 'BMPString' TYPE_STR_UTF8 = 'UTF8String' TYPE_STR_ISO646 = 'ISO646String' TYPE_STR_TELE = 'TeletexString' TYPE_STR_VID = 'VideotexString' TYPE_STR_GRAPH = 'GraphicString' TYPE_STR_T61 = 'T61String' TYPE_STR_GENE = 'GeneralString' TYPE_STR_UNIV = 'UniversalString' TYPE_OBJ_DESC = 'ObjectDescriptor' # time types TYPE_TIME_GEN = 'GeneralizedTime' TYPE_TIME_UTC = 'UTCTime' # constructed types TYPE_CHOICE = 'CHOICE' TYPE_SEQ = 'SEQUENCE' TYPE_SEQ_OF = 'SEQUENCE OF' TYPE_SET = 'SET' TYPE_SET_OF = 'SET OF' # wrapper types TYPE_OPEN = 'OPEN_TYPE' TYPE_ANY = 'ANY' TYPE_EXT = 'EXTERNAL' TYPE_EMB_PDV = 'EMBEDDED PDV' TYPE_CHAR_STR = 'CHARACTER STRING' # info object TYPE_CLASS = 'CLASS' TYPE_TYPEIDENT = 'TYPE-IDENTIFIER' TYPE_ABSSYNT = 'ABSTRACT-SYNTAX' TYPE_INSTOF = 'INSTANCE OF' # string types TYPE_STRINGS = (TYPE_STR_IA5, TYPE_STR_PRINT, TYPE_STR_NUM, TYPE_STR_VIS, TYPE_STR_BMP, TYPE_STR_UTF8, TYPE_STR_ISO646, TYPE_STR_TELE, TYPE_STR_VID, TYPE_STR_GRAPH, TYPE_STR_T61, TYPE_STR_GENE, TYPE_STR_UNIV, TYPE_OBJ_DESC) # types with constructed content TYPE_CONSTRUCT = (TYPE_SEQ_OF, TYPE_SET_OF, TYPE_CHOICE, TYPE_SEQ, TYPE_SET, TYPE_CLASS, TYPE_REAL, TYPE_EXT, TYPE_EMB_PDV) # types with potential SIZE constraint TYPE_CONST_SIZE = (TYPE_BIT_STR, TYPE_OCT_STR, TYPE_STR_IA5, TYPE_STR_PRINT, TYPE_STR_NUM, TYPE_STR_VIS, TYPE_STR_BMP, TYPE_STR_UTF8, TYPE_STR_ISO646, TYPE_STR_TELE, TYPE_STR_VID, TYPE_STR_GRAPH, TYPE_STR_T61, TYPE_STR_GENE, TYPE_STR_UNIV, TYPE_OBJ_DESC, TYPE_SEQ_OF, TYPE_SET_OF, TYPE_CHAR_STR) # ASN.1 tag identifers TAG_IMPLICIT = 'IMPLICIT' TAG_EXPLICIT = 'EXPLICIT' TAG_AUTO = 'AUTOMATIC' TAG_CONTEXT_SPEC = 'CONTEXT-SPECIFIC' TAG_PRIVATE = 'PRIVATE' TAG_APPLICATION = 'APPLICATION' TAG_UNIVERSAL = 'UNIVERSAL' # ASN.1 tag class canonical orderding TAG_CANON_ORDER = { TAG_UNIVERSAL: 0, TAG_APPLICATION: 1, TAG_CONTEXT_SPEC: 2, TAG_PRIVATE: 3 } # constraints supported for types CONST_VAL = 'VAL' # keys: 'root': list, # 'ext' : None or list CONST_SIZE = 'SIZE' # keys: 'root': list (of integer), # 'ext' : None or list CONST_CONTAINING = 'CONTAINING' # keys: 'obj' : ASN1Obj, # 'enc' : None or OID value CONST_ALPHABET = 'ALPHABET' # keys: 'root': list (of chars), # 'ext' : None or list CONST_COMPS = 'WITH COMPONENTS' # keys: 'root': list # 'ext': None or list # each component of the root / ext list is a # dict {'_abs' : list of absent ident, # '_pre' : list of present idents, # '$ident': {'const': [list of additional constraints for $ident]} # constraints supported for CLASS CONST_TABLE = 'TABLE' # keys: 'tab': CLASS set object gathering all root / ext values # 'at': str or None, # 'exc': str or None # constraints extacted but not supported at runtime CONST_COMP = 'WITH COMPONENT' # keys: none CONST_ENCODE_BY = 'ENCODE BY' # keys: None CONST_REGEXP = 'PATTERN' # keys: None CONST_CONSTRAIN_BY = 'CONSTRAINED BY' # keys: None CONST_PROPERTY = 'SETTINGS' # keys: none # specific flags for constructed types components and CLASS type fields FLAG_OPT = 'OPTIONAL' FLAG_UNIQ = 'UNIQUE' FLAG_DEF = 'DEFAULT' FLAG_DEFBY = 'DEFINED BY' # ------------------------------------------------------------------------------# # regexp for processing ASN.1 text # ------------------------------------------------------------------------------# # list of all ASN.1 keywords SYNT_KEYWORDS = ( 'ABSENT', 'ABSTRACT-SYNTAX', 'ALL', 'APPLICATION', 'AUTOMATIC', 'BEGIN', 'BIT', 'BMPString', 'BOOLEAN', 'BY', 'CHARACTER', 'CHOICE', 'CLASS', 'COMPONENT', 'COMPONENTS', 'CONSTRAINED', 'CONTAINING', 'DEFAULT', 'DEFINITIONS', 'EMBEDDED', 'ENCODED', 'END', 'ENUMERATED', 'EXCEPT', 'EXPLICIT', 'EXPORTS', 'EXTENSIBILITY', 'EXTERNAL', 'FALSE', 'FROM', 'GeneralizedTime', 'GeneralString', 'GraphicString', 'IA5String', 'IDENTIFIER', 'IMPLICIT', 'IMPLIED', 'IMPORTS', 'INCLUDES', 'INSTANCE', 'INTEGER', 'INTERSECTION', 'ISO646String', 'MAX', 'MIN', 'MINUS-INFINITY', 'NULL', 'NumericString', 'OBJECT', 'ObjectDescriptor', 'OCTET', 'OF', 'OPTIONAL', 'PATTERN', 'PDV', 'PLUS-INFINITY', 'PRESENT', 'PrintableString', 'PRIVATE', 'REAL', 'RELATIVE-OID', 'SEQUENCE', 'SET', 'SIZE', 'STRING', 'SYNTAX', 'T61String', 'TAGS', 'TeletexString', 'TRUE', 'TYPE-IDENTIFIER', 'UNION', 'UNIQUE', 'UNIVERSAL', 'UniversalString', 'UTCTime', 'UTF8String', 'VideotexString', 'VisibleString', 'WITH') _RE_KEYWORDS = '|'.join(SYNT_KEYWORDS) # list of all ASN.1 basic types, constructed types and class # WNG: OPEN_TYPE is a custom internal identifier # WNG: INSTANCE OF is handled as a native type since it has a specific syntax SYNT_NATIVE_TYPES = ( 'BOOLEAN', 'NULL', 'INTEGER', 'ENUMERATED', 'REAL', 'BIT STRING', 'OCTET STRING', 'OBJECT IDENTIFIER', 'RELATIVE-OID', 'NumericString', 'PrintableString', 'VisibleString', 'ISO646String', 'IA5String', 'TeletexString', 'T61String', 'VideotexString', 'GraphicString', 'GeneralString', 'UniversalString', 'BMPString', 'UTF8String', 'ObjectDescriptor', 'GeneralizedTime', 'UTCTime', 'SEQUENCE', 'SEQUENCE OF', 'SET', 'SET OF', 'CHOICE', 'EXTERNAL', 'EMBEDDED PDV', 'CHARACTER STRING', 'ANY', 'OPEN_TYPE', 'CLASS', 'TYPE-IDENTIFIER', 'ABSTRACT-SYNTAX', 'INSTANCE OF') _RE_NATIVE_TYPES = '|'.join(SYNT_NATIVE_TYPES) # list of all ASN.1 keywords that cannot be used in a WITH SYNTAX statement SYNT_SYNTAX_BL = ( 'BIT', 'BOOLEAN', 'CHARACTER', 'CHOICE', 'EMBEDDED', 'END', 'ENUMERATED', 'EXTERNAL', 'FALSE', 'INSTANCE', 'INTEGER', 'INTERSECTION', 'MINUS-INFINITY', 'NULL', 'OBJECT', 'OCTET', 'PLUS-INFINITY', 'REAL', 'RELATIVE-OID', 'SEQUENCE', 'SET', 'TRUE', 'UNION') # basic ASN.1 tokens _RE_INTEGER = '(?:\-{0,1}0{1})|(?:\-{0,1}[1-9]{1}[0-9]{0,})' _RE_INTEGER_POS = '(?:\-{0,1}0{1})|(?:[1-9]{1}[0-9]{0,})' _RE_IDENT = '[a-z]{1,}[a-zA-Z0-9\-]{0,}' _RE_TYPEREF = '[A-Z]{1,}[a-zA-Z0-9\-]{0,}' _RE_CLASSREF = '[A-Z]{1,}[A-Z0-9\-]{0,}' _RE_WORD = '[a-zA-Z]{1,}[a-zA-Z0-9\-]{0,}' # ASN.1 names SYNT_RE_WORD = re.compile( '(?:^|\s{1})(%s)' % _RE_WORD) SYNT_RE_IDENT = re.compile( '(?:^|\s{1})(%s)' % _RE_IDENT) SYNT_RE_TYPE = re.compile( '(?:^|\s{1})(%s)(?:$|[^0-9^a-z^A-Z^\-]{1,})' % _RE_NATIVE_TYPES) SYNT_RE_TYPEREF = re.compile( '(?:^|\s{1})(%s)' % _RE_TYPEREF) SYNT_RE_CLASSREF = re.compile( '(?:^|\s{1})(%s)' % _RE_CLASSREF) SYNT_RE_CLASSFIELDIDENT = re.compile( '(?:^|\s{1})\&([a-zA-Z0-9\-]{1,})') SYNT_RE_CLASSFIELDREF = re.compile( '(?:^|\s{1})((%s)\s{0,1}\.\&([a-zA-Z0-9\-]{1,}))' % _RE_CLASSREF) SYNT_RE_CLASSFIELDREFINT = re.compile( '(?:^|\s{1})\&(%s)' % _RE_TYPEREF) SYNT_RE_CLASSVALREF = re.compile( '(?:^|\s{1})((%s)\s{0,1}\.\&([a-zA-Z0-9\-]{1,}))' % _RE_IDENT) SYNT_RE_CLASSINSTFIELDREF = re.compile( '(?:^|\s{1})(%s)(?:\s{0,1}\.\&(%s)){0,}' % (_RE_WORD, _RE_WORD)) SYNT_RE_IDENTEXT = re.compile( '(?:^|\s{1})((%s)\.(%s))' % (_RE_TYPEREF, _RE_IDENT)) # WNG: SYNT_RE_TYPEREF matches also SYNT_RE_CLASSREF # ASN.1 expressions SYNT_RE_MODULEDEF = re.compile( '\s{1,}(DEFINITIONS)\s{1,}') SYNT_RE_MODULEREF = re.compile( '(?:^|\s{1})(%s){1}\s{0,}(\{[\s\-a-zA-Z0-9\(\)]{1,}\}){0,1}' % _RE_TYPEREF) SYNT_RE_MODULEFROM = re.compile( '(?:FROM\s{1,})(%s)\s*' % _RE_TYPEREF) SYNT_RE_MODULEFROM_SYM = re.compile( '(%s)(?:\s*\{\s*\}){0,1}(?:\s*,|\s{1,}FROM)' % _RE_WORD) SYNT_RE_MODULEFROM_OID = re.compile( '(%s)\s*|(\{[a-zA-Z0-9\(\)\-\s]{4,}\})\s*' % _RE_IDENT) SYNT_RE_MODULEFROM_WIT = re.compile( 'WITH\s{1,}(SUCCESSORS|DESCENDANTS)\s*') SYNT_RE_MODULEEXP = re.compile( '(?:^|\s{1})EXPORTS((.|\n)*?);') SYNT_RE_MODULEIMP = re.compile( '(?:^|\s{1})IMPORTS((.|\n)*?);') SYNT_RE_MODULEOPT = re.compile( '(?:^|\s{1})(EXPLICIT\s{1,}TAGS|IMPLICIT\s{1,}TAGS|AUTOMATIC\s{1,}TAGS)') SYNT_RE_MODULEEXT = re.compile( '(?:^|\s{1})(EXTENSIBILITY\s{1,}IMPLIED)') SYNT_RE_TAG = re.compile( '\[\s{0,}(UNIVERSAL|APPLICATION|PRIVATE){0,1}\s{0,}(?:(%s)|(%s))\s{0,}\]' \ % (_RE_INTEGER_POS, _RE_IDENT)) SYNT_RE_PARAM = re.compile( '(%s)(?:\s{0,}\:\s{0,}(%s|%s)){0,1}' \ % (_RE_TYPEREF, _RE_IDENT, _RE_TYPEREF)) SYNT_RE_SIZEOF = re.compile( '(\({0,1}\s{0,}SIZE)|(OF)') SYNT_RE_INT_ID = re.compile( '(%s)\s{0,}\(\s{0,}((%s)|(%s))\s{0,}\)' \ % (_RE_IDENT, _RE_INTEGER, _RE_IDENT)) SYNT_RE_ENUM = re.compile( '(%s|\.{3})\s{0,}(?:\(\s{0,}((%s)|(%s))\s{0,}\)){0,1}' \ % (_RE_IDENT, _RE_INTEGER, _RE_IDENT)) SYNT_RE_OID_COMP = re.compile( '(%s)|((%s)\s{0,}(?:\((%s)\)){0,1})' \ % (_RE_INTEGER_POS, _RE_IDENT, _RE_INTEGER_POS)) SYNT_RE_CLASSSYNTAX = re.compile( '(?:^|\s{1})((\[)|(\])|([A-Z\-]{1,})|(\&([a-zA-Z0-9\-]{1,})))') SYNT_RE_CHOICEALT = re.compile( '(?:^|\s{1})(?:(%s)(?:\s{0,}<\s{0,})){1,}(%s)' % (_RE_IDENT, _RE_TYPEREF)) SYNT_RE_INTVAL = re.compile( '(?:^|\s{1})(\-{0,1}[0-9]{1,})') SYNT_RE_BSTRING = re.compile( '(?:^|\s{1})\'([\s01]{0,})\'B') SYNT_RE_HSTRING = re.compile( '(?:^|\s{1})\'([\s0-9A-F]{0,})\'H') SYNT_RE_REALNUM = re.compile( '(?:^|\s{1})' \ '(\-{0,1}[0-9]{1,}){1}' \ '(?:\.([0-9]{0,})){0,1}' \ '(?:[eE](\-{0,1}[0-9]{1,})){0,1}') SYNT_RE_REALSEQ = re.compile( '(?:^|\s{1})' \ '(?:\{\s{0,}mantissa\s{1,}(\-{0,1}[0-9]{1,})\s{0,},' \ '\s{0,}base\s{1,}(2|10)\s{0,},' \ '\s{0,}exponent\s{1,}(\-{0,1}[0-9]{1,})\s{0,}\})') SYNT_RE_REALSPEC = re.compile( '(?:^|\s{1})((?:PLUS\-INFINITY)|(?:MINUS\-INFINITY)|(?:NOT-A-NUMBER))') SYNT_RE_UNIVSTR = re.compile( '(?:^|\s{1})(?:\{\s{0,}'\ '([0-9]{1,3})\s{0,},\s{0,}([0-9]{1,3})\s{0,},\s{0,}'\ '([0-9]{1,3})\s{0,},\s{0,}([0-9]{1,3})\s{0,}\})') SYNT_RE_TIMEUTC = re.compile( '(?:^|\s{1})' \ '"([0-9]{2})([0-9]{2})([0-9]{2})' \ '([0-9]{2})([0-9]{2})([0-9]{2}){0,1}' \ '((?:Z)|(?:[+-]{1}[0-9]{4}))"') SYNT_RE_TIMEGENE = re.compile( '(?:^|\s{1})' \ '"([0-9]{4})([0-9]{2})([0-9]{2})([0-9]{2})' \ '(?:([0-9]{2})([0-9]{2}){0,1}){0,1}' \ '(?:(?:\.|,)([0-9]{1,})){0,1}' \ '((?:Z)|(?:[+-](?:[0-9]{2}){0,2})){0,1}"') SYNT_RE_CONST_DISPATCH = re.compile( '(?:^|\s{1})(INCLUDES)|(SIZE)|(FROM)|(WITH COMPONENTS)|(WITH COMPONENT)|' \ '(PATTERN)|(SETTINGS)|(CONTAINING)|(ENCODED BY)|(CONSTRAINED BY)') SYNT_RE_CONST_EXT = re.compile( ',\s{0,}\.\.\.') SYNT_RE_GROUPVERS = re.compile( '(?:^|\s{1})[0-9]{1,}\s{0,1}\:') def match_typeref(text): m = SYNT_RE_TYPEREF.match(text) if not m: return None else: # ensure the match does not correspond to an ASN.1 keyword if m.group() in SYNT_KEYWORDS: return None else: return m # ------------------------------------------------------------------------------# # text processing routines # ------------------------------------------------------------------------------# def strip(text=''): return text.strip() def name_to_defin(n): if iskeyword(n): # n is a Python keyword n += '_' return n.replace('-', '_').replace(' ', '_') def scan_for_comments(text=''): """ returns a list of 2-tuple (start offset, end offset) for each ASN.1 comment found in text """ ret = [] cur = 0 next = text.find('--') while next >= 0: cur += next # start of comment start = cur # move cursor forward to reach the end of comment cur += 2 # exception for line full of ------------------ sh*t while text[cur:1+cur] == '-': cur += 1 while True: # move 1 by 1 if text[cur:1+cur] == '\n' or cur >= len(text): # end-of-line or end-of-file ret.append((start, cur)) cur += 1 break elif text[cur:2+cur] == '--': # end-of-comment cur += 2 ret.append((start, cur)) break else: cur += 1 # find the next comment next = text[cur:].find('--') return ret def scan_for_comments_cstyle(text=''): """ returns a list of 2-tuple (start offset, end offset) for each ASN.1 comment in C-style found in text """ ret = [] cur = 0 next = text.find('/*') while next >= 0: cur += next # start of comment start = cur # move cursor forward to reach the end of comment cur += 2 while True: # move 1 by 1 and find an end-of-comment or end-of-file if cur >= len(text): # end-of-file ret.append((start, cur)) break elif text[cur:2+cur] == '*/': # end-of-comment cur += 2 ret.append((start, cur)) break else: cur += 1 # find the next comment next = text[cur:].find('/*') return ret def clean_text(text=''): """ processes text to: remove ASN.1 comments replace tab with space remove duplicated spaces """ # WARNING: this routine for text cleanup, as it is applied early in the text # processing, may mess up ASN.1 string values # # remove comments comments = scan_for_comments(text) if comments: # get the complementary text to comments, to get the text containing # the actual definitions start, defins = 0, [] for (so, eo) in comments: defins.append( text[start:so] ) start = eo defins.append( text[start:len(text)] ) text = ''.join(defins) # # remove C-style comments comments = scan_for_comments_cstyle(text) if comments: start, defins = 0, [] for (so, eo) in comments: defins.append( text[start:so] ) start = eo defins.append( text[start:len(text)] ) text = ''.join(defins) # # replace tab with space text = text.replace('\t', ' ') # remove duplicated CR text = re.sub('\n{2,}', '\n', text) # remove duplicated spaces text = re.sub(' {2,}', ' ', text) # return text def search_top_lvl_sep(text='', sep=','): """ returns a list of offsets for each top-level separator `sep' found in the text """ ret = [] # count = {'(': 0, ')': 0, '{': 0, '}': 0, '[': 0, ']': 0} _is_top_level = lambda c: c['('] == c[')'] and c['{'] == c['}'] and c['['] == c[']'] # for cur in range(len(text)): if text[cur] in count: count[text[cur]] += 1 if text[cur] == sep and _is_top_level(count): ret.append(cur) return ret def search_top_lvl_off(text=''): """ returns the offsets in the text corresponding to the top level (outside of any parenthesis / bracket / curlybracket groups) """ # {1, 2, {3, True}} DEFAULT (1, 2) UNIQUE off = [[0]] # count = {'(': 0, ')': 0, '{': 0, '}': 0, '[': 0, ']': 0} _is_top_level = lambda c: c['('] == c[')'] and c['{'] == c['}'] and c['['] == c[']'] # top_level = True for cur in range(len(text)): char = text[cur] if char in count: count[char] += 1 if top_level and not _is_top_level(count): # transition to inner group: closing the top-level boundary off[-1].append(cur) top_level = False elif not top_level and _is_top_level(count): # transition to top level: opening a top-level boundary off.append([cur + 1]) top_level = True # end of text if top_level: off[-1].append(len(text) + 1) else: # error ? del off[-1] # some clean-up if off[0] == [0, 0]: del off[0] return off def search_between(text='', ins='{', outs='}'): """ returns a list of 2-tuple for each top level part of the text in-between `ins' and `outs' expression """ # TODO: look for character string, defined between double-quotes ", # and do not evaluate matching character inside them # if len(ins) != len(outs): raise(ASN1Err('requires identical length for ins and outs')) lens = len(ins) # ret = [] # count = {ins: 0, outs: 0} entered = False # for cur in range(len(text)): if not entered and text[cur:cur + lens] == ins: # passing initial ins char entered = True start = cur if text[cur:cur + lens] in count: # counting ins / outs chars count[text[cur:cur + lens]] += 1 if entered and count[ins] == count[outs]: # passing last outs char stop = cur + lens ret.append((start, stop)) entered = False return ret def extract_curlybrack(text=''): """ extracts the part of text between "{" and "}" if the "{" is at the start of the string returns the remaining text, and the extracted content or None """ text = text.strip() offsets = search_between(text, '{', '}') if not offsets: return text, None offsets = offsets[0] if offsets[0] != 0: return text, None return text[offsets[1]:].strip(), text[1:offsets[1] - 1].strip() def extract_parenth(text=''): """ extracts the part of text between "(" and ")" if the "(" is at the start of the string returns the remaining text, and the extracted content or None """ text = text.strip() offsets = search_between(text, '(', ')') if not offsets: return text, None offsets = offsets[0] if offsets[0] != 0: return text, None return text[offsets[1]:].strip(), text[1:offsets[1] - 1].strip() def extract_brack(text=''): """ extracts the part of text between "[" and "]" if the "[" is at the start of the string returns the remaining text, and the extracted content or None """ text = text.strip() offsets = search_between(text, '[', ']') if not offsets: return text, None offsets = offsets[0] if offsets[0] != 0: return text, None return text[offsets[1]:].strip(), text[1:offsets[1] - 1].strip() def extract_doublebrack(text=''): """ extracts the part of text between "[[" and "]]" if the "[[" is at the start of the string returns the remaining text, and the extracted content or None """ text = text.strip() offsets = search_between(text, '[[', ']]') if not offsets: return text, None offsets = offsets[0] if offsets[0] != 0: return text, None return text[offsets[1]:].strip(), text[2:offsets[1] - 2].strip() def extract_charstr(text=''): """ extracts the part of text between double-quote ", escaping doubled double-quotes, and removing newline groups returns the remaining text, and the extracted content or None """ text = text.strip() if text[0:1] != '"': return text, None elif len(text) == 1: return text, None # esc = 0 for cur in range(1, len(text)): # 1) end of text if cur == len(text) - 1: if text[cur:1+cur] != '"': # no end-of-charstr found return text, None else: return '', re.subn('\s{0,}\n\s{0,}', '', text[1:-1])[0] # 2) finding a double-quote if text[cur:1+cur] == '"': if esc > 0: # 2.1) escape cursor already set if cur == esc: # current double-quote escaped, unsetting escape cursor esc = 0 else: # current double-quote not escaped if text[1+cur:2+cur] == '"': # escaping next char esc = 1+cur else: # end of charstr return text[1+cur:].strip(), \ re.subn('\s{0,}\n\s{0,}', '', text[1:cur])[0] else: # 2.2) escape cursor not set if text[1+cur:2+cur] == '"': # escaping next char esc = 1+cur else: # end of charstr return text[1+cur:].strip(), \ re.subn('\s{0,}\n\s{0,}', '', text[1:cur])[0] def extract_multi(text=''): """ extracts the list of textual components between curly-brackets returns the remaining text, and the list of extracted textual components """ # e.g. { comp1, comp2, comp3 } rest, text = extract_curlybrack(text) if not text: return rest, text else: # split each coma-separated field coma_offsets = [-1] + search_top_lvl_sep(text, ',') + [len(text)] return rest, list(map(strip, [text[coma_offsets[i] + 1:coma_offsets[i + 1]] \ for i in range(len(coma_offsets) - 1)])) def extract_set(text=''): """ extracts the list of root and extended textual components, each component being separated with "|", and root and extension being separated with commas and "..." taking care of character strings definition between double-quotes " returns a dict with root and ext keys and corresponding strings """ # 1) we go char by char with a state machine, looking for: # 1) unescaped double-quote " # 2) or separator | # 3) coma , # text = text.strip() # # list the set of group of values # the current group of values # the current list of chars valset = [] valgrp = [] value = [] # # state that says if we are in a charstr, between " or not # we do not evaluate escaped double-quotes especially, # as it is like we are leaving and reentering the charstr state charstr = False # # state that says if we are inside any inner set inside the given set, # between { and } or not innerset = 0 # # go char by char for char in text: value.append(char) if char == '"': if charstr: charstr = False else: charstr = True else: if not charstr: if char == '{': innerset += 1 elif char == '}': innerset -= 1 if innerset < 0: raise(ASN1Err('extract_set, invalid number of closing curlybrackets'\ .format(text))) if innerset == 0: if char == '|': valgrp.append( ''.join(value[:-1]).strip() ) value = [] elif char == ',': valgrp.append( ''.join(value[:-1]).strip() ) value = [] valset.append( valgrp ) valgrp = [] if value: valgrp.append( ''.join(value).strip() ) if valgrp: valset.append( valgrp ) # # 2) we evaluate the list of groups found and the potential extensibility # marker in between, and build the resulting root / ext dict # if len(valset) == 0: return {'root': [], 'ext': None} elif len(valset) == 1: if valset[0] == ['...']: return {'root': [], 'ext': []} else: return {'root': valset[0], 'ext': None} elif len(valset) == 2: if valset[0] == ['...']: return {'root': [], 'ext': valset[1]} else: if valset[1] != ['...']: raise(ASN1Err('extract_set, invalid coma-separated groups, {0!r}'\ .format(valset))) return {'root': valset[0], 'ext': []} elif len(valset) == 3: if valset[1] != ['...']: raise(ASN1Err('extract_set, invalid coma-separated groups, {0!r}'\ .format(valset))) return {'root': valset[0], 'ext': valset[2]} else: raise(ASN1Err('extract_set, invalid coma-separated groups, {0!r}'\ .format(valset))) def extract_from_import(text=''): """ extracts the module name, reference and / or OID set after a FROM import statement, test `text` argument must start with the FROM keyword returns a 2-tuple with integer: length of the text containing the whole FROM statement dict: with "name", "oid", "oidref" and "with" keys """ m = SYNT_RE_MODULEFROM.match(text) assert(m) cur = m.end() ret = {'name': m.group(1), 'oid': None, 'oidref': None, 'with': None} # check if we stop or continue with an OID value or OID reference if SYNT_RE_MODULEFROM_SYM.match(text[cur:]) or not text[cur:]: return cur, ret m = SYNT_RE_MODULEFROM_OID.match(text[cur:]) assert(m) cur += m.end() assert(None in m.groups()) if m.group(1): ret['oidref'] = m.group(1) else: ret['oid'] = m.group(2) # check if there is a final WITH stmt m = SYNT_RE_MODULEFROM_WIT.match(text[cur:]) if m: ret['with'] = m.group(1) cur += m.end() # final control assert(SYNT_RE_MODULEFROM_SYM.match(text[cur:]) or not text[cur:]) return cur, ret