pycrate/pycrate_asn1c/utils.py

893 lines
29 KiB
Python

# -*- coding: UTF-8 -*-
# /**
# * Software Name : pycrate
# * Version : 0.4
# *
# * Copyright 2016. Benoit Michau. ANSSI.
# *
# * This library is free software; you can redistribute it and/or
# * modify it under the terms of the GNU Lesser General Public
# * License as published by the Free Software Foundation; either
# * version 2.1 of the License, or (at your option) any later version.
# *
# * This library is distributed in the hope that it will be useful,
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# * Lesser General Public License for more details.
# *
# * You should have received a copy of the GNU Lesser General Public
# * License along with this library; if not, write to the Free Software
# * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# * MA 02110-1301 USA
# *
# *--------------------------------------------------------
# * File Name : pycrate_asn1c/utils.py
# * Created : 2016-03-02
# * Authors : Benoit Michau
# *--------------------------------------------------------
# */
import re
import pprint
from keyword import iskeyword
# pycrate_core is used only for basic library-wide functions / variables:
# log(), python_version, integer_types, str_types
from pycrate_core.utils import *
from .err import ASN1Err
# ------------------------------------------------------------------------------#
# asn1-wide Python routines
# ------------------------------------------------------------------------------#
def asnlog(msg):
    """
    single logging entry point for the whole asn1 part

    forwards `msg` unchanged to the library-wide log() routine; rebind or
    edit this function to customize how the asn1 part logs
    """
    log(msg)
# shared pretty-printer instance used by the pprint() / pformat() wrappers
_PP = pprint.PrettyPrinter(
    indent=1,
    width=80,
    depth=None,
    stream=None
)


def pprint(obj):
    """
    pretty-prints obj with the shared PrettyPrinter instance

    returns whatever PrettyPrinter.pprint() returns
    NOTE: from this point on, the module-level name `pprint' refers to this
    function and no longer to the standard pprint module
    """
    return _PP.pprint(obj)


def pformat(obj):
    """
    returns the pretty-printed representation of obj as a string, built with
    the shared PrettyPrinter instance
    """
    return _PP.pformat(obj)
# ------------------------------------------------------------------------------#
# asn1-wide Python variables and identifiers
# ------------------------------------------------------------------------------#
# table of ASN.1 OID arc names required to be "known" by the compiler
# As visible in the entries below:
# key  : tuple made of the numeric values of the parent arcs (if any),
#        followed by the name of the arc
# value: number assigned to this arc name
ASN1_OID_ISO = {
# root arc 0, available under 2 names
('itu-t',): 0,
('ccitt',): 0,
(0, 'recommendation'): 0,
# arcs under 0.0: single letters a..z, numbered 1..26
(0, 0, 'a'): 1,
(0, 0, 'b'): 2,
(0, 0, 'c'): 3,
(0, 0, 'd'): 4,
(0, 0, 'e'): 5,
(0, 0, 'f'): 6,
(0, 0, 'g'): 7,
(0, 0, 'h'): 8,
(0, 0, 'i'): 9,
(0, 0, 'j'): 10,
(0, 0, 'k'): 11,
(0, 0, 'l'): 12,
(0, 0, 'm'): 13,
(0, 0, 'n'): 14,
(0, 0, 'o'): 15,
(0, 0, 'p'): 16,
(0, 0, 'q'): 17,
(0, 0, 'r'): 18,
(0, 0, 's'): 19,
(0, 0, 't'): 20,
(0, 0, 'u'): 21,
(0, 0, 'v'): 22,
(0, 0, 'w'): 23,
(0, 0, 'x'): 24,
(0, 0, 'y'): 25,
(0, 0, 'z'): 26,
# other arcs directly under 0
(0, 'question'): 1,
(0, 'administration'): 2,
(0, 'network-operator'): 3,
(0, 'identified-organization'): 4,
# root arc 1
('iso',): 1,
(1, 'standard'): 0,
(1, 'registration-authority'): 1,
(1, 'member-body'): 2,
(1, 2, 'f'): 250,
(1, 'identified-organization'): 3,
# root arc 2, available under 2 names
('joint-iso-itu-t',): 2,
('joint-iso-ccitt',): 2,
(2, 'asn1'): 1,
(2, 1, 'basic-encoding'): 1,
(2, 1, 'ber-derived'): 2,
(2, 1, 'packed-encoding'): 3,
(2, 'mhs-motif'): 6,
(2, 'ms'): 9,
(2, 'registration-procedures'): 17,
}
###
# DO NOT CHANGE the following identifiers
# as many of them correspond directly to the ASN.1 syntax
###
# ASN.1 object mode: one string identifier per mode
MODE_VALUE = 'VALUE'
MODE_SET = 'SET'
MODE_TYPE = 'TYPE'
# ASN.1 type identifiers: each constant holds the type name as a string
# basic types
TYPE_NULL = 'NULL'
TYPE_BOOL = 'BOOLEAN'
TYPE_INT = 'INTEGER'
TYPE_REAL = 'REAL'
TYPE_ENUM = 'ENUMERATED'
TYPE_BIT_STR = 'BIT STRING'
TYPE_OCT_STR = 'OCTET STRING'
TYPE_OID = 'OBJECT IDENTIFIER'
TYPE_REL_OID = 'RELATIVE-OID'
# character string types
TYPE_STR_IA5 = 'IA5String'
TYPE_STR_PRINT = 'PrintableString'
TYPE_STR_NUM = 'NumericString'
TYPE_STR_VIS = 'VisibleString'
TYPE_STR_BMP = 'BMPString'
TYPE_STR_UTF8 = 'UTF8String'
TYPE_STR_ISO646 = 'ISO646String'
TYPE_STR_TELE = 'TeletexString'
TYPE_STR_VID = 'VideotexString'
TYPE_STR_GRAPH = 'GraphicString'
TYPE_STR_T61 = 'T61String'
TYPE_STR_GENE = 'GeneralString'
TYPE_STR_UNIV = 'UniversalString'
TYPE_OBJ_DESC = 'ObjectDescriptor'
# time types
TYPE_TIME_GEN = 'GeneralizedTime'
TYPE_TIME_UTC = 'UTCTime'
# constructed types
# WNG: TYPE_SET and MODE_SET hold the same 'SET' string
TYPE_CHOICE = 'CHOICE'
TYPE_SEQ = 'SEQUENCE'
TYPE_SEQ_OF = 'SEQUENCE OF'
TYPE_SET = 'SET'
TYPE_SET_OF = 'SET OF'
# wrapper types
# WNG: 'OPEN_TYPE' is not part of the SYNT_KEYWORDS list defined in this module
TYPE_OPEN = 'OPEN_TYPE'
TYPE_ANY = 'ANY'
TYPE_EXT = 'EXTERNAL'
TYPE_EMB_PDV = 'EMBEDDED PDV'
TYPE_CHAR_STR = 'CHARACTER STRING'
# info object
TYPE_CLASS = 'CLASS'
TYPE_TYPEIDENT = 'TYPE-IDENTIFIER'
TYPE_ABSSYNT = 'ABSTRACT-SYNTAX'
TYPE_INSTOF = 'INSTANCE OF'
# group of all the character string types declared above,
# together with ObjectDescriptor
TYPE_STRINGS = (TYPE_STR_IA5, TYPE_STR_PRINT, TYPE_STR_NUM, TYPE_STR_VIS,
TYPE_STR_BMP, TYPE_STR_UTF8, TYPE_STR_ISO646, TYPE_STR_TELE,
TYPE_STR_VID, TYPE_STR_GRAPH, TYPE_STR_T61, TYPE_STR_GENE,
TYPE_STR_UNIV, TYPE_OBJ_DESC)
# group of types with constructed content
TYPE_CONSTRUCT = (TYPE_SEQ_OF, TYPE_SET_OF,
TYPE_CHOICE, TYPE_SEQ, TYPE_SET,
TYPE_CLASS,
TYPE_REAL, TYPE_EXT, TYPE_EMB_PDV)
# group of types with potential SIZE constraint:
# BIT STRING, OCTET STRING, every member of TYPE_STRINGS, SEQUENCE OF, SET OF
# and CHARACTER STRING
TYPE_CONST_SIZE = (TYPE_BIT_STR, TYPE_OCT_STR,
TYPE_STR_IA5, TYPE_STR_PRINT, TYPE_STR_NUM, TYPE_STR_VIS,
TYPE_STR_BMP, TYPE_STR_UTF8, TYPE_STR_ISO646, TYPE_STR_TELE,
TYPE_STR_VID, TYPE_STR_GRAPH, TYPE_STR_T61, TYPE_STR_GENE,
TYPE_STR_UNIV, TYPE_OBJ_DESC,
TYPE_SEQ_OF, TYPE_SET_OF,
TYPE_CHAR_STR)
# ASN.1 tag identifiers
# tagging modes
TAG_IMPLICIT = 'IMPLICIT'
TAG_EXPLICIT = 'EXPLICIT'
TAG_AUTO = 'AUTOMATIC'
# tag classes
TAG_CONTEXT_SPEC = 'CONTEXT-SPECIFIC'
TAG_PRIVATE = 'PRIVATE'
TAG_APPLICATION = 'APPLICATION'
TAG_UNIVERSAL = 'UNIVERSAL'
# ASN.1 tag class canonical ordering:
# maps each tag class to its rank (lowest rank first)
TAG_CANON_ORDER = {
TAG_UNIVERSAL: 0,
TAG_APPLICATION: 1,
TAG_CONTEXT_SPEC: 2,
TAG_PRIVATE: 3
}
# constraints supported for types
# the "keys" listed after each identifier are the keys of the dict that
# describes a constraint of this kind
CONST_VAL = 'VAL'
# keys: 'root': list,
#       'ext' : None or list
CONST_SIZE = 'SIZE'
# keys: 'root': list (of integer),
#       'ext' : None or list
CONST_CONTAINING = 'CONTAINING'
# keys: 'obj' : ASN1Obj,
#       'enc' : None or OID value
CONST_ALPHABET = 'ALPHABET'
# keys: 'root': list (of chars),
#       'ext' : None or list
CONST_COMPS = 'WITH COMPONENTS'
# keys: 'root': list
#       'ext' : None or list
# each component of the root / ext list is a dict
#   {'_abs'  : list of absent idents,
#    '_pre'  : list of present idents,
#    '$ident': {'const': [list of additional constraints for $ident]}}
# constraints supported for CLASS
CONST_TABLE = 'TABLE'
# keys: 'tab': CLASS set object gathering all root / ext values
#       'at' : str or None,
#       'exc': str or None
# constraints extracted but not supported at runtime
CONST_COMP = 'WITH COMPONENT'
# keys: none
# NOTE(review): the value below is 'ENCODE BY', whereas
# SYNT_RE_CONST_DISPATCH in this module matches the text 'ENCODED BY';
# confirm the difference is intended before relying on either spelling
CONST_ENCODE_BY = 'ENCODE BY'
# keys: none
CONST_REGEXP = 'PATTERN'
# keys: none
CONST_CONSTRAIN_BY = 'CONSTRAINED BY'
# keys: none
CONST_PROPERTY = 'SETTINGS'
# keys: none
# specific flags for constructed types components and CLASS type fields
FLAG_OPT = 'OPTIONAL'
FLAG_UNIQ = 'UNIQUE'
FLAG_DEF = 'DEFAULT'
FLAG_DEFBY = 'DEFINED BY'
# ------------------------------------------------------------------------------#
# regexp for processing ASN.1 text
# ------------------------------------------------------------------------------#
# list of all ASN.1 keywords
SYNT_KEYWORDS = (
    'ABSENT', 'ABSTRACT-SYNTAX', 'ALL', 'APPLICATION', 'AUTOMATIC', 'BEGIN',
    'BIT', 'BMPString', 'BOOLEAN', 'BY', 'CHARACTER', 'CHOICE', 'CLASS', 'COMPONENT',
    'COMPONENTS', 'CONSTRAINED', 'CONTAINING', 'DEFAULT', 'DEFINITIONS', 'EMBEDDED',
    'ENCODED', 'END', 'ENUMERATED', 'EXCEPT', 'EXPLICIT', 'EXPORTS', 'EXTENSIBILITY',
    'EXTERNAL', 'FALSE', 'FROM', 'GeneralizedTime', 'GeneralString', 'GraphicString',
    'IA5String', 'IDENTIFIER', 'IMPLICIT', 'IMPLIED', 'IMPORTS', 'INCLUDES', 'INSTANCE',
    'INTEGER', 'INTERSECTION', 'ISO646String', 'MAX', 'MIN', 'MINUS-INFINITY',
    'NULL', 'NumericString', 'OBJECT', 'ObjectDescriptor', 'OCTET', 'OF', 'OPTIONAL',
    'PATTERN', 'PDV', 'PLUS-INFINITY', 'PRESENT', 'PrintableString', 'PRIVATE',
    'REAL', 'RELATIVE-OID', 'SEQUENCE', 'SET', 'SIZE', 'STRING', 'SYNTAX', 'T61String',
    'TAGS', 'TeletexString', 'TRUE', 'TYPE-IDENTIFIER', 'UNION', 'UNIQUE', 'UNIVERSAL',
    'UniversalString', 'UTCTime', 'UTF8String', 'VideotexString', 'VisibleString',
    'WITH')
_RE_KEYWORDS = '|'.join(SYNT_KEYWORDS)

# list of all ASN.1 basic types, constructed types and class
# WNG: OPEN_TYPE is a custom internal identifier
# WNG: INSTANCE OF is handled as a native type since it has a specific syntax
SYNT_NATIVE_TYPES = (
    'BOOLEAN', 'NULL', 'INTEGER', 'ENUMERATED', 'REAL', 'BIT STRING',
    'OCTET STRING', 'OBJECT IDENTIFIER', 'RELATIVE-OID',
    'NumericString', 'PrintableString', 'VisibleString', 'ISO646String',
    'IA5String', 'TeletexString', 'T61String', 'VideotexString', 'GraphicString',
    'GeneralString', 'UniversalString', 'BMPString', 'UTF8String',
    'ObjectDescriptor', 'GeneralizedTime', 'UTCTime',
    'SEQUENCE', 'SEQUENCE OF', 'SET', 'SET OF', 'CHOICE',
    'EXTERNAL', 'EMBEDDED PDV', 'CHARACTER STRING',
    'ANY', 'OPEN_TYPE',
    'CLASS', 'TYPE-IDENTIFIER', 'ABSTRACT-SYNTAX', 'INSTANCE OF')
_RE_NATIVE_TYPES = '|'.join(SYNT_NATIVE_TYPES)

# list of all ASN.1 keywords that cannot be used in a WITH SYNTAX statement
SYNT_SYNTAX_BL = (
    'BIT', 'BOOLEAN', 'CHARACTER', 'CHOICE', 'EMBEDDED', 'END', 'ENUMERATED',
    'EXTERNAL', 'FALSE', 'INSTANCE', 'INTEGER', 'INTERSECTION', 'MINUS-INFINITY',
    'NULL', 'OBJECT', 'OCTET', 'PLUS-INFINITY', 'REAL', 'RELATIVE-OID', 'SEQUENCE',
    'SET', 'TRUE', 'UNION')

# All patterns below are written as raw strings: sequences such as \s, \- or
# \& are not valid Python string escapes, and a non-raw literal containing
# them triggers a DeprecationWarning (SyntaxWarning from Python 3.12).
# The regexps themselves match exactly the same text as before.

# basic ASN.1 tokens
# WNG: _RE_INTEGER and _RE_INTEGER_POS are bare alternations (a|b), they must
# always be embedded within a group, as done by every regexp below
_RE_INTEGER = r'(?:\-{0,1}0{1})|(?:\-{0,1}[1-9]{1}[0-9]{0,})'
_RE_INTEGER_POS = r'(?:\-{0,1}0{1})|(?:[1-9]{1}[0-9]{0,})'
_RE_IDENT = r'[a-z]{1,}[a-zA-Z0-9\-]{0,}'
_RE_TYPEREF = r'[A-Z]{1,}[a-zA-Z0-9\-]{0,}'
_RE_CLASSREF = r'[A-Z]{1,}[A-Z0-9\-]{0,}'
_RE_WORD = r'[a-zA-Z]{1,}[a-zA-Z0-9\-]{0,}'

# ASN.1 names
# most of the regexp start with (?:^|\s{1}), which accepts a single optional
# leading whitespace character when used with .match()
SYNT_RE_WORD = re.compile(
    r'(?:^|\s{1})(%s)' % _RE_WORD)
SYNT_RE_IDENT = re.compile(
    r'(?:^|\s{1})(%s)' % _RE_IDENT)
SYNT_RE_TYPE = re.compile(
    r'(?:^|\s{1})(%s)(?:$|[^0-9^a-z^A-Z^\-]{1,})' % _RE_NATIVE_TYPES)
SYNT_RE_TYPEREF = re.compile(
    r'(?:^|\s{1})(%s)' % _RE_TYPEREF)
SYNT_RE_CLASSREF = re.compile(
    r'(?:^|\s{1})(%s)' % _RE_CLASSREF)
SYNT_RE_CLASSFIELDIDENT = re.compile(
    r'(?:^|\s{1})\&([a-zA-Z0-9\-]{1,})')
SYNT_RE_CLASSFIELDREF = re.compile(
    r'(?:^|\s{1})((%s)\s{0,1}\.\&([a-zA-Z0-9\-]{1,}))' % _RE_CLASSREF)
SYNT_RE_CLASSFIELDREFINT = re.compile(
    r'(?:^|\s{1})\&(%s)' % _RE_TYPEREF)
SYNT_RE_CLASSVALREF = re.compile(
    r'(?:^|\s{1})((%s)\s{0,1}\.\&([a-zA-Z0-9\-]{1,}))' % _RE_IDENT)
SYNT_RE_CLASSINSTFIELDREF = re.compile(
    r'(?:^|\s{1})(%s)(?:\s{0,1}\.\&(%s)){0,}' % (_RE_WORD, _RE_WORD))
SYNT_RE_IDENTEXT = re.compile(
    r'(?:^|\s{1})((%s)\.(%s))' % (_RE_TYPEREF, _RE_IDENT))
# WNG: SYNT_RE_TYPEREF matches also SYNT_RE_CLASSREF

# ASN.1 expressions
SYNT_RE_MODULEDEF = re.compile(
    r'\s{1,}(DEFINITIONS)\s{1,}')
SYNT_RE_MODULEREF = re.compile(
    r'(?:^|\s{1})(%s){1}\s{0,}(\{[\s\-a-zA-Z0-9\(\)]{1,}\}){0,1}' % _RE_TYPEREF)
SYNT_RE_MODULEFROM = re.compile(
    r'(?:FROM\s{1,})(%s)\s*' % _RE_TYPEREF)
SYNT_RE_MODULEFROM_SYM = re.compile(
    r'(%s)(?:\s*\{\s*\}){0,1}(?:\s*,|\s{1,}FROM)' % _RE_WORD)
SYNT_RE_MODULEFROM_OID = re.compile(
    r'(%s)\s*|(\{[a-zA-Z0-9\(\)\-\s]{4,}\})\s*' % _RE_IDENT)
SYNT_RE_MODULEFROM_WIT = re.compile(
    r'WITH\s{1,}(SUCCESSORS|DESCENDANTS)\s*')
SYNT_RE_MODULEEXP = re.compile(
    r'(?:^|\s{1})EXPORTS((.|\n)*?);')
SYNT_RE_MODULEIMP = re.compile(
    r'(?:^|\s{1})IMPORTS((.|\n)*?);')
SYNT_RE_MODULEOPT = re.compile(
    r'(?:^|\s{1})(EXPLICIT\s{1,}TAGS|IMPLICIT\s{1,}TAGS|AUTOMATIC\s{1,}TAGS)')
SYNT_RE_MODULEEXT = re.compile(
    r'(?:^|\s{1})(EXTENSIBILITY\s{1,}IMPLIED)')
SYNT_RE_TAG = re.compile(
    r'\[\s{0,}(UNIVERSAL|APPLICATION|PRIVATE){0,1}\s{0,}(?:(%s)|(%s))\s{0,}\]'
    % (_RE_INTEGER_POS, _RE_IDENT))
SYNT_RE_PARAM = re.compile(
    r'(%s)(?:\s{0,}\:\s{0,}(%s|%s)){0,1}'
    % (_RE_TYPEREF, _RE_IDENT, _RE_TYPEREF))
SYNT_RE_SIZEOF = re.compile(
    r'(\({0,1}\s{0,}SIZE)|(OF)')
SYNT_RE_INT_ID = re.compile(
    r'(%s)\s{0,}\(\s{0,}((%s)|(%s))\s{0,}\)'
    % (_RE_IDENT, _RE_INTEGER, _RE_IDENT))
SYNT_RE_ENUM = re.compile(
    r'(%s|\.{3})\s{0,}(?:\(\s{0,}((%s)|(%s))\s{0,}\)){0,1}'
    % (_RE_IDENT, _RE_INTEGER, _RE_IDENT))
SYNT_RE_OID_COMP = re.compile(
    r'(%s)|((%s)\s{0,}(?:\((%s)\)){0,1})'
    % (_RE_INTEGER_POS, _RE_IDENT, _RE_INTEGER_POS))
SYNT_RE_CLASSSYNTAX = re.compile(
    r'(?:^|\s{1})((\[)|(\])|([A-Z\-]{1,})|(\&([a-zA-Z0-9\-]{1,})))')
SYNT_RE_CHOICEALT = re.compile(
    r'(?:^|\s{1})(?:(%s)(?:\s{0,}<\s{0,})){1,}(%s)' % (_RE_IDENT, _RE_TYPEREF))
SYNT_RE_INTVAL = re.compile(
    r'(?:^|\s{1})(\-{0,1}[0-9]{1,})')
SYNT_RE_BSTRING = re.compile(
    r"(?:^|\s{1})'([\s01]{0,})'B")
SYNT_RE_HSTRING = re.compile(
    r"(?:^|\s{1})'([\s0-9A-F]{0,})'H")
SYNT_RE_REALNUM = re.compile(
    r'(?:^|\s{1})'
    r'(\-{0,1}[0-9]{1,}){1}'
    r'(?:\.([0-9]{0,})){0,1}'
    r'(?:[eE](\-{0,1}[0-9]{1,})){0,1}')
SYNT_RE_REALSEQ = re.compile(
    r'(?:^|\s{1})'
    r'(?:\{\s{0,}mantissa\s{1,}(\-{0,1}[0-9]{1,})\s{0,},'
    r'\s{0,}base\s{1,}(2|10)\s{0,},'
    r'\s{0,}exponent\s{1,}(\-{0,1}[0-9]{1,})\s{0,}\})')
SYNT_RE_REALSPEC = re.compile(
    r'(?:^|\s{1})((?:PLUS\-INFINITY)|(?:MINUS\-INFINITY)|(?:NOT-A-NUMBER))')
SYNT_RE_UNIVSTR = re.compile(
    r'(?:^|\s{1})(?:\{\s{0,}'
    r'([0-9]{1,3})\s{0,},\s{0,}([0-9]{1,3})\s{0,},\s{0,}'
    r'([0-9]{1,3})\s{0,},\s{0,}([0-9]{1,3})\s{0,}\})')
SYNT_RE_TIMEUTC = re.compile(
    r'(?:^|\s{1})'
    r'"([0-9]{2})([0-9]{2})([0-9]{2})'
    r'([0-9]{2})([0-9]{2})([0-9]{2}){0,1}'
    r'((?:Z)|(?:[+-]{1}[0-9]{4}))"')
SYNT_RE_TIMEGENE = re.compile(
    r'(?:^|\s{1})'
    r'"([0-9]{4})([0-9]{2})([0-9]{2})([0-9]{2})'
    r'(?:([0-9]{2})([0-9]{2}){0,1}){0,1}'
    r'(?:(?:\.|,)([0-9]{1,})){0,1}'
    r'((?:Z)|(?:[+-](?:[0-9]{2}){0,2})){0,1}"')
# NOTE(review): the leading (?:^|\s{1}) group only applies to the INCLUDES
# alternative, the other alternatives are matched without it; kept as is
SYNT_RE_CONST_DISPATCH = re.compile(
    r'(?:^|\s{1})(INCLUDES)|(SIZE)|(FROM)|(WITH COMPONENTS)|(WITH COMPONENT)|'
    r'(PATTERN)|(SETTINGS)|(CONTAINING)|(ENCODED BY)|(CONSTRAINED BY)')
SYNT_RE_CONST_EXT = re.compile(
    r',\s{0,}\.\.\.')
SYNT_RE_GROUPVERS = re.compile(
    r'(?:^|\s{1})[0-9]{1,}\s{0,1}\:')
def match_typeref(text):
    """
    matches an ASN.1 type reference at the start of text

    returns the match object produced by SYNT_RE_TYPEREF, or None if text
    does not start with a type reference or if the name matched is an
    ASN.1 keyword listed in SYNT_KEYWORDS
    """
    m = SYNT_RE_TYPEREF.match(text)
    if m is None:
        return None
    # ensure the match does not correspond to an ASN.1 keyword
    # group(1) is the name only, whereas group() would also include the
    # leading whitespace accepted by the regexp and let a keyword preceded
    # by a space slip through the check
    if m.group(1) in SYNT_KEYWORDS:
        return None
    return m
# ------------------------------------------------------------------------------#
# text processing routines
# ------------------------------------------------------------------------------#
def strip(text=''):
    """
    returns text without its leading and trailing whitespaces
    """
    stripped = text.strip()
    return stripped
def name_to_defin(n):
    """
    converts the name n into a string usable as a Python name:
    an underscore is appended when n is a Python keyword,
    and every dash and space is replaced with an underscore
    """
    suffix = '_' if iskeyword(n) else ''
    defin = n + suffix
    for char in (' ', '-'):
        defin = defin.replace(char, '_')
    return defin
def scan_for_comments(text=''):
    """
    returns a list of 2-tuple (start offset, end offset) for each ASN.1 comment
    found in text

    a comment starts with "--" and ends with the next "--" (included in the
    comment) or at the end of the line (newline excluded) or at the end of
    the text, whichever comes first
    WNG: "--" is looked for everywhere, including within character strings
    """
    ret = []
    size = len(text)
    # offsets are searched with str.find(sub, start), which avoids copying
    # the remaining text for each comment
    start = text.find('--')
    while start >= 0:
        cur = start + 2
        # exception for lines full of dashes: the whole run of dashes
        # belongs to the opening of the comment
        while text[cur:1+cur] == '-':
            cur += 1
        eol = text.find('\n', cur)
        eoc = text.find('--', cur)
        if eoc >= 0 and (eol < 0 or eoc < eol):
            # end-of-comment marker found first: it is part of the comment
            stop = eoc + 2
            resume = stop
        elif eol >= 0:
            # end-of-line found first: the newline is kept out of the comment
            stop = eol
            resume = eol + 1
        else:
            # end-of-file
            stop = size
            resume = size
        ret.append((start, stop))
        # find the next comment
        start = text.find('--', resume)
    return ret
def scan_for_comments_cstyle(text=''):
    """
    returns a list of 2-tuple (start offset, end offset) for each ASN.1 comment
    in C-style found in text

    a comment starts with "/*" and ends with the first following "*/"
    (included in the comment), or at the end of the text when it is not closed
    NOTE(review): nested comments are not tracked, a comment always stops at
    the first "*/" met
    """
    ret = []
    # offsets are searched with str.find(sub, start), which avoids copying
    # the remaining text for each comment
    start = text.find('/*')
    while start >= 0:
        close = text.find('*/', start + 2)
        if close < 0:
            # end-of-file reached without end-of-comment
            stop = len(text)
        else:
            stop = close + 2
        ret.append((start, stop))
        # find the next comment
        start = text.find('/*', stop)
    return ret
def clean_text(text=''):
    """
    returns text after:
        removing ASN.1 comments (-- style first, then C-style)
        replacing each tab with a space
        collapsing each group of newlines into a single newline
        collapsing each group of spaces into a single space
    """
    # WARNING: this routine for text cleanup, as it is applied early in the text
    # processing, may mess up ASN.1 string values
    #
    # the C-style scan runs on the text already freed from -- comments
    for scanner in (scan_for_comments, scan_for_comments_cstyle):
        comments = scanner(text)
        if not comments:
            continue
        # keep the text complementary to the comments, i.e. the text
        # containing the actual definitions
        kept, pos = [], 0
        for (so, eo) in comments:
            kept.append(text[pos:so])
            pos = eo
        kept.append(text[pos:])
        text = ''.join(kept)
    #
    # tab to space, then duplicated CR, then duplicated spaces
    text = text.replace('\t', ' ')
    text = re.sub('\n{2,}', '\n', text)
    text = re.sub(' {2,}', ' ', text)
    #
    return text
def search_top_lvl_sep(text='', sep=','):
    """
    returns the list of offsets of each separator `sep' found in text at the
    top level, i.e. where the numbers of "(" and ")", of "{" and "}", and of
    "[" and "]" met so far (current char included) are all equal
    """
    seen = dict.fromkeys('(){}[]', 0)
    found = []
    for pos, char in enumerate(text):
        if char in seen:
            seen[char] += 1
        if char != sep:
            continue
        opened = (seen['('], seen['{'], seen['['])
        closed = (seen[')'], seen['}'], seen[']'])
        if opened == closed:
            found.append(pos)
    return found
def search_top_lvl_off(text=''):
    """
    returns the offsets in the text corresponding to the top level
    (outside of any parenthesis / bracket / curlybracket groups)

    the result is a list of [start, end] lists; when the text ends at the top
    level, the end offset of the last item is len(text) + 1
    returns an empty list when the only top-level segment gets discarded
    because the text ends inside an unclosed group
    """
    # e.g. {1, 2, {3, True}} DEFAULT (1, 2) UNIQUE
    off = [[0]]
    #
    count = {'(': 0, ')': 0, '{': 0, '}': 0, '[': 0, ']': 0}

    def _is_top_level(c):
        return c['('] == c[')'] and c['{'] == c['}'] and c['['] == c[']']
    #
    top_level = True
    for cur, char in enumerate(text):
        if char not in count:
            # the level can only change on a grouping character
            continue
        count[char] += 1
        balanced = _is_top_level(count)
        if top_level and not balanced:
            # transition to inner group: closing the top-level boundary
            off[-1].append(cur)
            top_level = False
        elif balanced and not top_level:
            # transition to top level: opening a top-level boundary
            off.append([cur + 1])
            top_level = True
    # end of text
    if top_level:
        off[-1].append(len(text) + 1)
    else:
        # text ends within an unclosed group: the last item is dropped
        del off[-1]
    # some clean-up
    # off may be empty at this stage (unclosed group), hence the guard which
    # prevents an IndexError
    if off and off[0] == [0, 0]:
        del off[0]
    return off
def search_between(text='', ins='{', outs='}'):
    """
    returns a list of 2-tuple (start offset, stop offset) for each top level
    part of the text enclosed between the `ins' and `outs' expressions,
    both expressions being included in the part

    raises ASN1Err if ins and outs do not have the same length
    """
    # TODO: look for character string, defined between double-quotes ",
    # and do not evaluate matching character inside them
    #
    width = len(ins)
    if width != len(outs):
        raise(ASN1Err('requires identical length for ins and outs'))
    #
    spans = []
    tally = {ins: 0, outs: 0}
    inside, begin = False, None
    #
    for pos in range(len(text)):
        token = text[pos:pos + width]
        if token == ins and not inside:
            # passing initial ins expression
            inside, begin = True, pos
        if token in tally:
            # counting ins / outs expressions, whether inside a part or not
            tally[token] += 1
        if inside and tally[ins] == tally[outs]:
            # passing last outs expression
            spans.append((begin, pos + width))
            inside = False
    return spans
def _extract_enclosed(text, ins, outs):
    """
    common routine for the extract_*brack() / extract_parenth() functions

    strips text, then extracts the part enclosed between the `ins' and `outs'
    expressions if `ins' is at the very start of the stripped text
    returns the remaining text, and the extracted content or None
    """
    text = text.strip()
    offsets = search_between(text, ins, outs)
    if not offsets or offsets[0][0] != 0:
        # no enclosed part, or not at the start of the text
        return text, None
    stop = offsets[0][1]
    lens = len(ins)
    # the content excludes the ins and outs expressions themselves
    return text[stop:].strip(), text[lens:stop - lens].strip()


def extract_curlybrack(text=''):
    """
    extracts the part of text between "{" and "}" if the "{" is at the start
    of the string
    returns the remaining text, and the extracted content or None
    """
    return _extract_enclosed(text, '{', '}')


def extract_parenth(text=''):
    """
    extracts the part of text between "(" and ")" if the "(" is at the start
    of the string
    returns the remaining text, and the extracted content or None
    """
    return _extract_enclosed(text, '(', ')')


def extract_brack(text=''):
    """
    extracts the part of text between "[" and "]" if the "[" is at the start
    of the string
    returns the remaining text, and the extracted content or None
    """
    return _extract_enclosed(text, '[', ']')


def extract_doublebrack(text=''):
    """
    extracts the part of text between "[[" and "]]" if the "[[" is at the start
    of the string
    returns the remaining text, and the extracted content or None
    """
    return _extract_enclosed(text, '[[', ']]')
# newline together with the whitespaces around it, removed from character
# strings by extract_charstr()
# raw string: \s is not a valid Python string escape
_RE_CHARSTR_NEWLINE = re.compile(r'\s{0,}\n\s{0,}')


def extract_charstr(text=''):
    """
    extracts the part of text between double-quote ", escaping doubled
    double-quotes, and removing newline groups
    returns the remaining text, and the extracted content or None

    the stripped text and None are returned when text does not start with
    a double-quote or when no closing double-quote is found
    WNG: doubled double-quotes are kept as is in the extracted content
    """
    text = text.strip()
    if text[0:1] != '"' or len(text) == 1:
        return text, None
    #
    last = len(text) - 1
    cur = 1
    while cur < last:
        if text[cur] == '"':
            if text[1+cur] == '"':
                # doubled double-quote: escaped, skipping both chars
                cur += 2
                continue
            # end of charstr
            return text[1+cur:].strip(), \
                   _RE_CHARSTR_NEWLINE.sub('', text[1:cur])
        cur += 1
    # end of text reached: the last char has to be the closing double-quote
    # this includes the case where it is the 2nd char of a doubled
    # double-quote, which is then accepted as the closing one
    if text[last] != '"':
        # no end-of-charstr found
        return text, None
    return '', _RE_CHARSTR_NEWLINE.sub('', text[1:-1])
def extract_multi(text=''):
    """
    extracts the list of top-level coma-separated textual components found
    between curly-brackets at the start of text
    returns the remaining text, and the list of extracted textual components

    when extract_curlybrack() provides no content (None or empty string),
    this content is returned as is in place of the list
    """
    # e.g. { comp1, comp2, comp3 }
    rest, inner = extract_curlybrack(text)
    if not inner:
        return rest, inner
    # boundaries of each field: virtual coma before the first char,
    # every top-level coma, and virtual coma after the last char
    bounds = [-1]
    bounds.extend(search_top_lvl_sep(inner, ','))
    bounds.append(len(inner))
    comps = [inner[lo + 1:hi].strip() for (lo, hi) in zip(bounds, bounds[1:])]
    return rest, comps
def extract_set(text=''):
    """
    extracts the list of root and extended textual components,
    each component being separated with "|",
    and root and extension being separated with commas and "..."
    taking care of character strings definition between double-quotes "
    returns a dict with root and ext keys:
        root: list of strings
        ext : None (no extensibility marker) or list of strings

    raises ASN1Err in case of unbalanced closing curlybracket or invalid
    layout of the coma-separated groups
    """
    # 1) we go char by char with a state machine, looking for:
    #    1) unescaped double-quote "
    #    2) or separator |
    #    3) coma ,
    #
    text = text.strip()
    #
    # valset: the set of group of values
    # valgrp: the current group of values
    # value : the current list of chars
    valset, valgrp, value = [], [], []
    #
    # state that says if we are in a charstr, between " or not
    # we do not evaluate escaped double-quotes especially,
    # as it is like we are leaving and reentering the charstr state
    charstr = False
    #
    # depth of inner sets, between { and }, inside the given set
    innerset = 0
    #
    # go char by char
    for char in text:
        value.append(char)
        if char == '"':
            charstr = not charstr
            continue
        if charstr:
            # nothing is evaluated inside a charstr
            continue
        if char == '{':
            innerset += 1
        elif char == '}':
            innerset -= 1
            if innerset < 0:
                # the message now has a placeholder for the offending text,
                # which was passed to format() but never rendered
                raise(ASN1Err('extract_set, invalid number of closing '\
                              'curlybrackets, {0!r}'.format(text)))
        if innerset == 0 and char in ('|', ','):
            # end of the current value, the separator itself is dropped
            valgrp.append( ''.join(value[:-1]).strip() )
            value = []
            if char == ',':
                # end of the current group too
                valset.append( valgrp )
                valgrp = []
    if value:
        valgrp.append( ''.join(value).strip() )
    if valgrp:
        valset.append( valgrp )
    #
    # 2) we evaluate the list of groups found and the potential extensibility
    # marker in between, and build the resulting root / ext dict
    #
    marker = ['...']
    nbgrp = len(valset)
    if nbgrp == 0:
        return {'root': [], 'ext': None}
    elif nbgrp == 1:
        if valset[0] == marker:
            return {'root': [], 'ext': []}
        return {'root': valset[0], 'ext': None}
    elif nbgrp == 2:
        if valset[0] == marker:
            return {'root': [], 'ext': valset[1]}
        if valset[1] != marker:
            raise(ASN1Err('extract_set, invalid coma-separated groups, {0!r}'\
                          .format(valset)))
        return {'root': valset[0], 'ext': []}
    elif nbgrp == 3:
        if valset[1] != marker:
            raise(ASN1Err('extract_set, invalid coma-separated groups, {0!r}'\
                          .format(valset)))
        return {'root': valset[0], 'ext': valset[2]}
    else:
        raise(ASN1Err('extract_set, invalid coma-separated groups, {0!r}'\
                      .format(valset)))
def extract_from_import(text=''):
    """
    extracts the module name, reference and / or OID set after a FROM import
    statement, the `text` argument must start with the FROM keyword
    returns a 2-tuple with
        integer: length of the text containing the whole FROM statement
        dict: with "name", "oid", "oidref" and "with" keys

    WNG: the syntax is checked with assert statements only, an AssertionError
    is raised for unexpected text unless Python runs with assertions disabled
    """
    found = SYNT_RE_MODULEFROM.match(text)
    assert(found)
    cur = found.end()
    ret = {'name': found.group(1), 'oid': None, 'oidref': None, 'with': None}
    # check if we stop or continue with an OID value or OID reference
    rest = text[cur:]
    if SYNT_RE_MODULEFROM_SYM.match(rest) or not rest:
        return cur, ret
    found = SYNT_RE_MODULEFROM_OID.match(rest)
    assert(found)
    cur += found.end()
    # only one of the OID reference and OID value groups can be set
    assert(None in found.groups())
    oidref = found.group(1)
    if oidref:
        ret['oidref'] = oidref
    else:
        ret['oid'] = found.group(2)
    # check if there is a final WITH stmt
    found = SYNT_RE_MODULEFROM_WIT.match(text[cur:])
    if found:
        ret['with'] = found.group(1)
        cur += found.end()
    # final control: we must be at the end of the text or in front of the
    # next imported symbol
    rest = text[cur:]
    assert(SYNT_RE_MODULEFROM_SYM.match(rest) or not rest)
    return cur, ret