pycrate/pycrate_asn1c/utils.py

# -*- coding: UTF-8 -*-
# /**
# * Software Name : pycrate
# * Version : 0.4
# *
# * Copyright 2016. Benoit Michau. ANSSI.
# *
# * This library is free software; you can redistribute it and/or
# * modify it under the terms of the GNU Lesser General Public
# * License as published by the Free Software Foundation; either
# * version 2.1 of the License, or (at your option) any later version.
# *
# * This library is distributed in the hope that it will be useful,
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# * Lesser General Public License for more details.
# *
# * You should have received a copy of the GNU Lesser General Public
# * License along with this library; if not, write to the Free Software
# * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# * MA 02110-1301  USA
# *
# *--------------------------------------------------------
# * File Name : pycrate_asn1c/utils.py
# * Created : 2016-03-02
# * Authors : Benoit Michau
# *--------------------------------------------------------
# */

import re
import pprint
from keyword import iskeyword

# pycrate_core is used only for basic library-wide functions / variables:
# log(), python_version, integer_types, str_types
from pycrate_core.utils import *
from .err               import ASN1Err

# ------------------------------------------------------------------------------#
# asn1-wide Python routines
# ------------------------------------------------------------------------------#

def asnlog(msg):
    """
    customizable logging function for the whole asn1 part

    forwards `msg' unchanged to log(), which is brought into scope by the
    star import from pycrate_core.utils; nothing is returned
    """
    log(msg)


# module-wide pretty-printer shared by pprint() and pformat()
# WNG: pprint() defined right below rebinds the name of the `pprint' module,
# the printer must hence be instantiated before that definition
_PP = pprint.PrettyPrinter(indent=1, width=80, depth=None, stream=None)


def pprint(obj):
    """
    pretty-prints obj through the module-wide printer
    returns whatever the printer's pprint() method returns
    """
    printed = _PP.pprint(obj)
    return printed


def pformat(obj):
    """
    returns the pretty-printed representation of obj, as built by the
    module-wide printer
    """
    formatted = _PP.pformat(obj)
    return formatted

# ------------------------------------------------------------------------------#
# asn1-wide Python variables and identifiers
# ------------------------------------------------------------------------------#

# list of ASN.1 OIDs required to be "known" by the compiler
# arcs below the itu-t / ccitt root (0)
ASN1_OID_ISO = {
    ('itu-t',): 0,
    ('ccitt',): 0,
    (0, 'recommendation'): 0,
}
# arcs below {0 0}: one arc per letter, a(1) to z(26)
ASN1_OID_ISO.update(
    ((0, 0, letter), num)
    for num, letter in enumerate('abcdefghijklmnopqrstuvwxyz', 1))
ASN1_OID_ISO.update((
    ((0, 'question'), 1),
    ((0, 'administration'), 2),
    ((0, 'network-operator'), 3),
    ((0, 'identified-organization'), 4),
    # arcs below the iso root (1)
    (('iso',), 1),
    ((1, 'standard'), 0),
    ((1, 'registration-authority'), 1),
    ((1, 'member-body'), 2),
    ((1, 2, 'f'), 250),
    ((1, 'identified-organization'), 3),
    # arcs below the joint-iso-itu-t / joint-iso-ccitt root (2)
    (('joint-iso-itu-t',), 2),
    (('joint-iso-ccitt',), 2),
    ((2, 'asn1'), 1),
    ((2, 1, 'basic-encoding'), 1),
    ((2, 1, 'ber-derived'), 2),
    ((2, 1, 'packed-encoding'), 3),
    ((2, 'mhs-motif'), 6),
    ((2, 'ms'), 9),
    ((2, 'registration-procedures'), 17),
))

###
# DO NOT CHANGE the following identifiers
# as many of them correspond directly to the ASN.1 syntax
###

# ASN.1 object mode
MODE_VALUE = 'VALUE'
MODE_SET   = 'SET'
MODE_TYPE  = 'TYPE'

# ASN.1 type identifiers
# -- basic types
TYPE_NULL       = 'NULL'
TYPE_BOOL       = 'BOOLEAN'
TYPE_INT        = 'INTEGER'
TYPE_REAL       = 'REAL'
TYPE_ENUM       = 'ENUMERATED'
TYPE_BIT_STR    = 'BIT STRING'
TYPE_OCT_STR    = 'OCTET STRING'
TYPE_OID        = 'OBJECT IDENTIFIER'
TYPE_REL_OID    = 'RELATIVE-OID'
# -- character string types
TYPE_STR_IA5    = 'IA5String'
TYPE_STR_PRINT  = 'PrintableString'
TYPE_STR_NUM    = 'NumericString'
TYPE_STR_VIS    = 'VisibleString'
TYPE_STR_BMP    = 'BMPString'
TYPE_STR_UTF8   = 'UTF8String'
TYPE_STR_ISO646 = 'ISO646String'
TYPE_STR_TELE   = 'TeletexString'
TYPE_STR_VID    = 'VideotexString'
TYPE_STR_GRAPH  = 'GraphicString'
TYPE_STR_T61    = 'T61String'
TYPE_STR_GENE   = 'GeneralString'
TYPE_STR_UNIV   = 'UniversalString'
TYPE_OBJ_DESC   = 'ObjectDescriptor'
# -- time types
TYPE_TIME_GEN   = 'GeneralizedTime'
TYPE_TIME_UTC   = 'UTCTime'
# -- constructed types
TYPE_CHOICE     = 'CHOICE'
TYPE_SEQ        = 'SEQUENCE'
TYPE_SEQ_OF     = 'SEQUENCE OF'
TYPE_SET        = 'SET'
TYPE_SET_OF     = 'SET OF'
# -- wrapper types
TYPE_OPEN       = 'OPEN_TYPE'
TYPE_ANY        = 'ANY'
TYPE_EXT        = 'EXTERNAL'
TYPE_EMB_PDV    = 'EMBEDDED PDV'
TYPE_CHAR_STR   = 'CHARACTER STRING'
# -- information object related identifiers
TYPE_CLASS      = 'CLASS'
TYPE_TYPEIDENT  = 'TYPE-IDENTIFIER'
TYPE_ABSSYNT    = 'ABSTRACT-SYNTAX'
TYPE_INSTOF     = 'INSTANCE OF'


# group of all the character string types (ObjectDescriptor included)
TYPE_STRINGS = (
    TYPE_STR_IA5,
    TYPE_STR_PRINT,
    TYPE_STR_NUM,
    TYPE_STR_VIS,
    TYPE_STR_BMP,
    TYPE_STR_UTF8,
    TYPE_STR_ISO646,
    TYPE_STR_TELE,
    TYPE_STR_VID,
    TYPE_STR_GRAPH,
    TYPE_STR_T61,
    TYPE_STR_GENE,
    TYPE_STR_UNIV,
    TYPE_OBJ_DESC,
    )

# group of the types with constructed content
TYPE_CONSTRUCT = (
    TYPE_SEQ_OF,
    TYPE_SET_OF,
    TYPE_CHOICE,
    TYPE_SEQ,
    TYPE_SET,
    TYPE_CLASS,
    TYPE_REAL,
    TYPE_EXT,
    TYPE_EMB_PDV,
    )

# group of the types that may carry a SIZE constraint:
# bit / octet strings, then every character string type, then the *-OF
# containers and the unrestricted CHARACTER STRING
TYPE_CONST_SIZE = (TYPE_BIT_STR, TYPE_OCT_STR) \
                  + TYPE_STRINGS \
                  + (TYPE_SEQ_OF, TYPE_SET_OF, TYPE_CHAR_STR)


# ASN.1 tag identifiers
# -- tagging modes
TAG_IMPLICIT     = 'IMPLICIT'
TAG_EXPLICIT     = 'EXPLICIT'
TAG_AUTO         = 'AUTOMATIC'
# -- tag classes
TAG_CONTEXT_SPEC = 'CONTEXT-SPECIFIC'
TAG_PRIVATE      = 'PRIVATE'
TAG_APPLICATION  = 'APPLICATION'
TAG_UNIVERSAL    = 'UNIVERSAL'

# ASN.1 tag class canonical ordering:
# each class is mapped to its rank, starting from 0
TAG_CANON_ORDER = dict(
    (tag_class, rank) for rank, tag_class in enumerate(
        (TAG_UNIVERSAL, TAG_APPLICATION, TAG_CONTEXT_SPEC, TAG_PRIVATE)))

# constraints supported for types
# each identifier below names a kind of constraint; the comment following it
# lists the keys of the dict describing a constraint of that kind
CONST_VAL = 'VAL'
# keys: 'root': list,
#       'ext' : None or list
CONST_SIZE = 'SIZE'
# keys: 'root': list (of integer),
#       'ext' : None or list
CONST_CONTAINING = 'CONTAINING'
# keys: 'obj' : ASN1Obj,
#       'enc' : None or OID value
CONST_ALPHABET = 'ALPHABET'
# keys: 'root': list (of chars),
#       'ext' : None or list
CONST_COMPS = 'WITH COMPONENTS'
# keys: 'root': list
#       'ext': None or list
# each component of the root / ext list is a
# dict {'_abs' : list of absent idents,
#       '_pre' : list of present idents,
#       '$ident': {'const': [list of additional constraints for $ident]}}
# constraints supported for CLASS
CONST_TABLE = 'TABLE'
# keys: 'tab': CLASS set object gathering all root / ext values
#       'at': str or None,
#       'exc': str or None
# constraints extracted but not supported at runtime
CONST_COMP = 'WITH COMPONENT'
# keys: none
# NOTE(review): the identifier below reads 'ENCODE BY' whereas
# SYNT_RE_CONST_DISPATCH matches the text 'ENCODED BY'; presumably only used
# as an internal key -- confirm before aligning them
CONST_ENCODE_BY = 'ENCODE BY'
# keys: none
CONST_REGEXP = 'PATTERN'
# keys: none
CONST_CONSTRAIN_BY = 'CONSTRAINED BY'
# keys: none
CONST_PROPERTY = 'SETTINGS'
# keys: none

# specific flags for constructed types components and CLASS type fields
FLAG_OPT   = 'OPTIONAL'
FLAG_UNIQ  = 'UNIQUE'
FLAG_DEF   = 'DEFAULT'
FLAG_DEFBY = 'DEFINED BY'

# ------------------------------------------------------------------------------#
# regexp for processing ASN.1 text
# ------------------------------------------------------------------------------#

# list of all ASN.1 keywords
SYNT_KEYWORDS = (
    'ABSENT', 'ABSTRACT-SYNTAX', 'ALL', 'APPLICATION', 'AUTOMATIC', 'BEGIN',
    'BIT', 'BMPString', 'BOOLEAN', 'BY', 'CHARACTER', 'CHOICE', 'CLASS', 'COMPONENT',
    'COMPONENTS', 'CONSTRAINED', 'CONTAINING', 'DEFAULT', 'DEFINITIONS', 'EMBEDDED',
    'ENCODED', 'END', 'ENUMERATED', 'EXCEPT', 'EXPLICIT', 'EXPORTS', 'EXTENSIBILITY',
    'EXTERNAL', 'FALSE', 'FROM', 'GeneralizedTime', 'GeneralString', 'GraphicString',
    'IA5String', 'IDENTIFIER', 'IMPLICIT', 'IMPLIED', 'IMPORTS', 'INCLUDES', 'INSTANCE',
    'INTEGER', 'INTERSECTION', 'ISO646String', 'MAX', 'MIN', 'MINUS-INFINITY',
    'NULL', 'NumericString', 'OBJECT', 'ObjectDescriptor', 'OCTET', 'OF', 'OPTIONAL',
    'PATTERN', 'PDV', 'PLUS-INFINITY', 'PRESENT', 'PrintableString', 'PRIVATE',
    'REAL', 'RELATIVE-OID', 'SEQUENCE', 'SET', 'SIZE', 'STRING', 'SYNTAX', 'T61String',
    'TAGS', 'TeletexString', 'TRUE', 'TYPE-IDENTIFIER', 'UNION', 'UNIQUE', 'UNIVERSAL',
    'UniversalString', 'UTCTime', 'UTF8String', 'VideotexString', 'VisibleString',
    'WITH')
_RE_KEYWORDS = '|'.join(SYNT_KEYWORDS)

# list of all ASN.1 basic types, constructed types and class
# WNG: OPEN_TYPE is a custom internal identifier
# WNG: INSTANCE OF is handled as a native type since it has a specific syntax
SYNT_NATIVE_TYPES = (
    'BOOLEAN', 'NULL', 'INTEGER', 'ENUMERATED', 'REAL', 'BIT STRING',
    'OCTET STRING', 'OBJECT IDENTIFIER', 'RELATIVE-OID',
    'NumericString', 'PrintableString', 'VisibleString', 'ISO646String',
    'IA5String', 'TeletexString', 'T61String', 'VideotexString', 'GraphicString',
    'GeneralString', 'UniversalString', 'BMPString', 'UTF8String',
    'ObjectDescriptor', 'GeneralizedTime', 'UTCTime',
    'SEQUENCE', 'SEQUENCE OF', 'SET', 'SET OF', 'CHOICE',
    'EXTERNAL', 'EMBEDDED PDV', 'CHARACTER STRING',
    'ANY', 'OPEN_TYPE',
    'CLASS', 'TYPE-IDENTIFIER', 'ABSTRACT-SYNTAX', 'INSTANCE OF')
_RE_NATIVE_TYPES = '|'.join(SYNT_NATIVE_TYPES)

# list of all ASN.1 keywords that cannot be used in a WITH SYNTAX statement
SYNT_SYNTAX_BL = (
    'BIT', 'BOOLEAN', 'CHARACTER', 'CHOICE', 'EMBEDDED', 'END', 'ENUMERATED',
    'EXTERNAL', 'FALSE', 'INSTANCE', 'INTEGER', 'INTERSECTION', 'MINUS-INFINITY',
    'NULL', 'OBJECT', 'OCTET', 'PLUS-INFINITY', 'REAL', 'RELATIVE-OID', 'SEQUENCE',
    'SET', 'TRUE', 'UNION')

# WNG: every pattern below is written as a raw string, so that backslashes
# reach the regexp engine untouched: sequences such as \s or \- are not valid
# Python string escapes and trigger a warning at compile time of the module
# when used in plain string literals

# basic ASN.1 tokens
_RE_INTEGER = r'(?:\-{0,1}0{1})|(?:\-{0,1}[1-9]{1}[0-9]{0,})'
_RE_INTEGER_POS = r'(?:\-{0,1}0{1})|(?:[1-9]{1}[0-9]{0,})'
_RE_IDENT = r'[a-z]{1,}[a-zA-Z0-9\-]{0,}'
_RE_TYPEREF = r'[A-Z]{1,}[a-zA-Z0-9\-]{0,}'
_RE_CLASSREF = r'[A-Z]{1,}[A-Z0-9\-]{0,}'
_RE_WORD = r'[a-zA-Z]{1,}[a-zA-Z0-9\-]{0,}'

# ASN.1 names
# most of the patterns accept a single optional leading blank character
# before the name they capture
SYNT_RE_WORD = re.compile(
    r'(?:^|\s{1})(%s)' % _RE_WORD)
SYNT_RE_IDENT = re.compile(
    r'(?:^|\s{1})(%s)' % _RE_IDENT)
SYNT_RE_TYPE = re.compile(
    r'(?:^|\s{1})(%s)(?:$|[^0-9^a-z^A-Z^\-]{1,})' % _RE_NATIVE_TYPES)
SYNT_RE_TYPEREF = re.compile(
    r'(?:^|\s{1})(%s)' % _RE_TYPEREF)
SYNT_RE_CLASSREF = re.compile(
    r'(?:^|\s{1})(%s)' % _RE_CLASSREF)
SYNT_RE_CLASSFIELDIDENT = re.compile(
    r'(?:^|\s{1})\&([a-zA-Z0-9\-]{1,})')
SYNT_RE_CLASSFIELDREF = re.compile(
    r'(?:^|\s{1})((%s)\s{0,1}\.\&([a-zA-Z0-9\-]{1,}))' % _RE_CLASSREF)
SYNT_RE_CLASSFIELDREFINT = re.compile(
    r'(?:^|\s{1})\&(%s)' % _RE_TYPEREF)
SYNT_RE_CLASSVALREF = re.compile(
    r'(?:^|\s{1})((%s)\s{0,1}\.\&([a-zA-Z0-9\-]{1,}))' % _RE_IDENT)
SYNT_RE_CLASSINSTFIELDREF = re.compile(
    r'(?:^|\s{1})(%s)(?:\s{0,1}\.\&(%s)){0,}' % (_RE_WORD, _RE_WORD))
SYNT_RE_IDENTEXT = re.compile(
    r'(?:^|\s{1})((%s)\.(%s))' % (_RE_TYPEREF, _RE_IDENT))
# WNG: SYNT_RE_TYPEREF matches also SYNT_RE_CLASSREF

# ASN.1 expressions
SYNT_RE_MODULEDEF = re.compile(
    r'\s{1,}(DEFINITIONS)\s{1,}')
SYNT_RE_MODULEREF = re.compile(
    r'(?:^|\s{1})(%s){1}\s{0,}(\{[\s\-a-zA-Z0-9\(\)]{1,}\}){0,1}' % _RE_TYPEREF)

SYNT_RE_MODULEFROM = re.compile(
    r'(?:FROM\s{1,})(%s)\s*' % _RE_TYPEREF)
SYNT_RE_MODULEFROM_SYM = re.compile(
    r'(%s)(?:\s*\{\s*\}){0,1}(?:\s*,|\s{1,}FROM)' % _RE_WORD)
SYNT_RE_MODULEFROM_OID = re.compile(
    r'(%s)\s*|(\{[a-zA-Z0-9\(\)\-\s]{4,}\})\s*' % _RE_IDENT)
SYNT_RE_MODULEFROM_WIT = re.compile(
    r'WITH\s{1,}(SUCCESSORS|DESCENDANTS)\s*')

# in both patterns below, \n is resolved by the regexp engine into a newline
SYNT_RE_MODULEEXP = re.compile(
    r'(?:^|\s{1})EXPORTS((.|\n)*?);')
SYNT_RE_MODULEIMP = re.compile(
    r'(?:^|\s{1})IMPORTS((.|\n)*?);')
SYNT_RE_MODULEOPT = re.compile(
    r'(?:^|\s{1})(EXPLICIT\s{1,}TAGS|IMPLICIT\s{1,}TAGS|AUTOMATIC\s{1,}TAGS)')
SYNT_RE_MODULEEXT = re.compile(
    r'(?:^|\s{1})(EXTENSIBILITY\s{1,}IMPLIED)')
SYNT_RE_TAG = re.compile(
    r'\[\s{0,}(UNIVERSAL|APPLICATION|PRIVATE){0,1}\s{0,}(?:(%s)|(%s))\s{0,}\]'
    % (_RE_INTEGER_POS, _RE_IDENT))
SYNT_RE_PARAM = re.compile(
    r'(%s)(?:\s{0,}\:\s{0,}(%s|%s)){0,1}'
    % (_RE_TYPEREF, _RE_IDENT, _RE_TYPEREF))
SYNT_RE_SIZEOF = re.compile(
    r'(\({0,1}\s{0,}SIZE)|(OF)')
SYNT_RE_INT_ID = re.compile(
    r'(%s)\s{0,}\(\s{0,}((%s)|(%s))\s{0,}\)'
    % (_RE_IDENT, _RE_INTEGER, _RE_IDENT))
SYNT_RE_ENUM = re.compile(
    r'(%s|\.{3})\s{0,}(?:\(\s{0,}((%s)|(%s))\s{0,}\)){0,1}'
    % (_RE_IDENT, _RE_INTEGER, _RE_IDENT))
SYNT_RE_OID_COMP = re.compile(
    r'(%s)|((%s)\s{0,}(?:\((%s)\)){0,1})'
    % (_RE_INTEGER_POS, _RE_IDENT, _RE_INTEGER_POS))
SYNT_RE_CLASSSYNTAX = re.compile(
    r'(?:^|\s{1})((\[)|(\])|([A-Z\-]{1,})|(\&([a-zA-Z0-9\-]{1,})))')
SYNT_RE_CHOICEALT = re.compile(
    r'(?:^|\s{1})(?:(%s)(?:\s{0,}<\s{0,})){1,}(%s)' % (_RE_IDENT, _RE_TYPEREF))
SYNT_RE_INTVAL = re.compile(
    r'(?:^|\s{1})(\-{0,1}[0-9]{1,})')
SYNT_RE_BSTRING = re.compile(
    r"(?:^|\s{1})'([\s01]{0,})'B")
SYNT_RE_HSTRING = re.compile(
    r"(?:^|\s{1})'([\s0-9A-F]{0,})'H")
SYNT_RE_REALNUM = re.compile(
    r'(?:^|\s{1})'
    r'(\-{0,1}[0-9]{1,}){1}'
    r'(?:\.([0-9]{0,})){0,1}'
    r'(?:[eE](\-{0,1}[0-9]{1,})){0,1}')
SYNT_RE_REALSEQ = re.compile(
    r'(?:^|\s{1})'
    r'(?:\{\s{0,}mantissa\s{1,}(\-{0,1}[0-9]{1,})\s{0,},'
    r'\s{0,}base\s{1,}(2|10)\s{0,},'
    r'\s{0,}exponent\s{1,}(\-{0,1}[0-9]{1,})\s{0,}\})')
SYNT_RE_REALSPEC = re.compile(
    r'(?:^|\s{1})((?:PLUS\-INFINITY)|(?:MINUS\-INFINITY)|(?:NOT-A-NUMBER))')
SYNT_RE_UNIVSTR = re.compile(
    r'(?:^|\s{1})(?:\{\s{0,}'
    r'([0-9]{1,3})\s{0,},\s{0,}([0-9]{1,3})\s{0,},\s{0,}'
    r'([0-9]{1,3})\s{0,},\s{0,}([0-9]{1,3})\s{0,}\})')
SYNT_RE_TIMEUTC = re.compile(
    r'(?:^|\s{1})'
    r'"([0-9]{2})([0-9]{2})([0-9]{2})'
    r'([0-9]{2})([0-9]{2})([0-9]{2}){0,1}'
    r'((?:Z)|(?:[+-]{1}[0-9]{4}))"')
SYNT_RE_TIMEGENE = re.compile(
    r'(?:^|\s{1})'
    r'"([0-9]{4})([0-9]{2})([0-9]{2})([0-9]{2})'
    r'(?:([0-9]{2})([0-9]{2}){0,1}){0,1}'
    r'(?:(?:\.|,)([0-9]{1,})){0,1}'
    r'((?:Z)|(?:[+-](?:[0-9]{2}){0,2})){0,1}"')
# WNG: the leading (?:^|\s{1}) group only applies to the INCLUDES alternative
SYNT_RE_CONST_DISPATCH = re.compile(
    r'(?:^|\s{1})(INCLUDES)|(SIZE)|(FROM)|(WITH COMPONENTS)|(WITH COMPONENT)|'
    r'(PATTERN)|(SETTINGS)|(CONTAINING)|(ENCODED BY)|(CONSTRAINED BY)')
SYNT_RE_CONST_EXT = re.compile(
    r',\s{0,}\.\.\.')
SYNT_RE_GROUPVERS = re.compile(
    r'(?:^|\s{1})[0-9]{1,}\s{0,1}\:')


def match_typeref(text):
    """
    matches a type reference at the start of text

    returns the match object of SYNT_RE_TYPEREF (the reference is in group 1),
    or None if there is no match or if the matched name is an ASN.1 keyword
    """
    m = SYNT_RE_TYPEREF.match(text)
    if m is None:
        return None
    # ensure the match does not correspond to an ASN.1 keyword
    # WNG: group(1) is the bare name, whereas group() also includes the
    # blank character optionally matched in front of it, which would let
    # a keyword preceded by a blank go through
    if m.group(1) in SYNT_KEYWORDS:
        return None
    return m

# ------------------------------------------------------------------------------#
# text processing routines
# ------------------------------------------------------------------------------#


def strip(text=''):
    """
    returns text without its leading and trailing whitespaces
    function counterpart of the strip() method, e.g. for use with map()
    """
    stripped = text.strip()
    return stripped


def name_to_defin(n):
    """
    converts the name n into a string usable as a Python name:
    an underscore is appended when n is a Python keyword, then dashes and
    spaces are replaced with underscores
    """
    suffix = '_' if iskeyword(n) else ''
    defin = n + suffix
    for char in ('-', ' '):
        defin = defin.replace(char, '_')
    return defin


def scan_for_comments(text=''):
    """
    returns a list of 2-tuple (start offset, end offset) for each ASN.1 comment
    found in text

    a comment starts with "--" and ends with the next "--" on the same line
    (included in the comment), or at the end of the line (the newline char is
    not part of the comment), or at the end of the text
    WNG: "--" found within character strings are handled as comments too
    """
    ret = []
    length = len(text)
    # search with explicit offsets instead of slicing the text, to avoid
    # copying the remaining text for each comment
    start = text.find('--')
    while start >= 0:
        # move cursor forward, after the start-of-comment
        cur = start + 2
        # exception for lines full of dashes: all the dashes directly
        # following the start-of-comment belong to the comment
        while text[cur:1+cur] == '-':
            cur += 1
        # the comment cannot go further than the end-of-line / end-of-file
        eol = text.find('\n', cur)
        if eol < 0:
            eol = length
        # look for an end-of-comment before this limit
        eoc = text.find('--', cur, eol)
        if eoc >= 0:
            # end-of-comment: the closing dashes are part of the comment
            cur = eoc + 2
            ret.append((start, cur))
        else:
            # end-of-line or end-of-file
            ret.append((start, eol))
            cur = eol + 1
        # find the next comment
        start = text.find('--', cur)
    return ret


def scan_for_comments_cstyle(text=''):
    """
    returns a list of 2-tuple (start offset, end offset) for each ASN.1 comment
    in C-style found in text

    a comment starts with "/*" and ends with the next "*/" (included in the
    comment), or at the end of the text when it is not closed
    WNG: nested comments are not handled especially, the first "*/" found
    closes the comment
    """
    ret = []
    # search with explicit offsets instead of slicing the text, to avoid
    # copying the remaining text for each comment
    start = text.find('/*')
    while start >= 0:
        # the end-of-comment cannot overlap with the start-of-comment
        eoc = text.find('*/', start + 2)
        if eoc < 0:
            # comment not closed: it runs up to the end-of-file
            stop = len(text)
        else:
            stop = eoc + 2
        ret.append((start, stop))
        # find the next comment
        start = text.find('/*', stop)
    return ret


def clean_text(text=''):
    """
    returns text after:
        removing ASN.1 comments ("--" style first, then C-style)
        replacing tabs with spaces
        collapsing groups of newlines into a single newline
        collapsing groups of spaces into a single space
    """
    # WARNING: this cleanup is applied early in the text processing,
    # and may hence mess up ASN.1 string values
    #
    # remove both kinds of comment, one kind after the other: the offsets
    # returned by a scanner are only valid against the text it was given
    for scanner in (scan_for_comments, scan_for_comments_cstyle):
        spans = scanner(text)
        if not spans:
            continue
        # keep the text complementary to the comments, i.e. the text
        # containing the actual definitions
        kept, resume = [], 0
        for (begin, end) in spans:
            kept.append(text[resume:begin])
            resume = end
        kept.append(text[resume:])
        text = ''.join(kept)
    #
    # tabs become spaces
    text = text.replace('\t', ' ')
    # duplicated newlines, then duplicated spaces, are reduced to one
    for pattern, single in (('\n{2,}', '\n'), (' {2,}', ' ')):
        text = re.sub(pattern, single, text)
    #
    return text


def search_top_lvl_sep(text='', sep=','):
    """
    returns the list of offsets of the separator `sep' in the text, keeping
    only those at top level, i.e. outside of any parenthesis / bracket /
    curlybracket group
    """
    found = []
    #
    # balance of opening vs. closing chars, for each kind of group
    depth = {'()': 0, '{}': 0, '[]': 0}
    effect = {'(': ('()', 1), ')': ('()', -1),
              '{': ('{}', 1), '}': ('{}', -1),
              '[': ('[]', 1), ']': ('[]', -1)}
    #
    for pos, char in enumerate(text):
        if char in effect:
            kind, delta = effect[char]
            depth[kind] += delta
        # the balance is updated before testing the separator
        if char == sep and not any(depth.values()):
            found.append(pos)
    return found


def search_top_lvl_off(text=''):
    """
    returns the offsets in the text corresponding to the top level
    (outside of any parenthesis / bracket / curlybracket groups)

    the result is a list of [start, stop] lists, one per top-level part;
    the stop offset of a part running up to the end of the text is
    len(text) + 1
    when the text ends within an unclosed group, the last part is discarded,
    and the result may be empty
    """
    # e.g. {1, 2, {3, True}} DEFAULT (1, 2) UNIQUE
    off = [[0]]
    #
    count = {'(': 0, ')': 0, '{': 0, '}': 0, '[': 0, ']': 0}
    #
    top_level = True
    for cur, char in enumerate(text):
        if char not in count:
            # only group delimiters can change the current level
            continue
        count[char] += 1
        balanced = count['('] == count[')'] and \
                   count['{'] == count['}'] and \
                   count['['] == count[']']
        if top_level and not balanced:
            # transition to inner group: closing the top-level boundary
            off[-1].append(cur)
            top_level = False
        elif not top_level and balanced:
            # transition to top level: opening a top-level boundary
            off.append([cur + 1])
            top_level = True
    # end of text
    if top_level:
        off[-1].append(len(text) + 1)
    else:
        # error ?
        # NOTE(review): the part deleted here is a complete [start, stop]
        # part, located before the unclosed group; behaviour kept as is
        del off[-1]
    # some clean-up: drop the empty part created when the text starts with
    # a group; off can be empty here, when the only part was just discarded
    if off and off[0] == [0, 0]:
        del off[0]
    return off


def search_between(text='', ins='{', outs='}'):
    """
    returns a list of 2-tuple (start offset, stop offset) for each top level
    part of the text in-between `ins' and `outs' expression, both included

    groups that are never closed are not reported
    raises ASN1Err if `ins' and `outs' do not have the same length
    """
    # TODO: look for character string, defined between double-quotes ",
    # and do not evaluate matching character inside them
    #
    if len(ins) != len(outs):
        raise ASN1Err('requires identical length for ins and outs')
    lens = len(ins)
    #
    ret = []
    #
    count = {ins: 0, outs: 0}
    entered = False
    #
    for cur in range(len(text)):
        token = text[cur:cur + lens]
        if not entered:
            if token != ins:
                # outside of any group: nothing to count, in particular
                # stray outs must not unbalance the next group
                continue
            # passing initial ins char: counting restarts for this group
            entered = True
            start = cur
            count = {ins: 0, outs: 0}
        if token in count:
            # counting ins / outs chars
            count[token] += 1
        if count[ins] == count[outs]:
            # passing last outs char
            ret.append((start, cur + lens))
            entered = False
    return ret


def _extract_enclosed(text, ins, outs):
    """
    common routine for the extract_*() functions working on delimiters:
    extracts the part of text between `ins' and `outs' if `ins' is at the
    start of the stripped text
    returns the remaining text, and the extracted content or None
    """
    text = text.strip()
    offsets = search_between(text, ins, outs)
    if not offsets or offsets[0][0] != 0:
        # no complete group, or first group not at the start of the text
        return text, None
    stop = offsets[0][1]
    # the content is returned without its delimiters
    return text[stop:].strip(), text[len(ins):stop - len(outs)].strip()


def extract_curlybrack(text=''):
    """
    extracts the part of text between "{" and "}" if the "{" is at the start
    of the string
    returns the remaining text, and the extracted content or None
    """
    return _extract_enclosed(text, '{', '}')


def extract_parenth(text=''):
    """
    extracts the part of text between "(" and ")" if the "(" is at the start
    of the string
    returns the remaining text, and the extracted content or None
    """
    return _extract_enclosed(text, '(', ')')


def extract_brack(text=''):
    """
    extracts the part of text between "[" and "]" if the "[" is at the start
    of the string
    returns the remaining text, and the extracted content or None
    """
    return _extract_enclosed(text, '[', ']')


def extract_doublebrack(text=''):
    """
    extracts the part of text between "[[" and "]]" if the "[[" is at the start
    of the string
    returns the remaining text, and the extracted content or None
    """
    return _extract_enclosed(text, '[[', ']]')


def extract_charstr(text=''):
    """
    extracts the part of text between double-quote ", escaping doubled
    double-quotes, and removing newline groups
    returns the remaining text, and the extracted content or None

    the content is None when the stripped text does not start with a
    double-quote, or when the character string is not terminated
    doubled double-quotes are kept as is (doubled) in the extracted content
    """
    text = text.strip()
    if text[0:1] != '"' or len(text) == 1:
        return text, None
    #
    def _unwrap(content):
        # removes each newline together with the blank chars surrounding it
        return re.subn(r'\s{0,}\n\s{0,}', '', content)[0]
    #
    last = len(text) - 1
    # offset of the double-quote escaped by the one just before it, if any
    esc = None
    for cur in range(1, len(text)):
        # 1) end of text
        if cur == last:
            if text[cur:1+cur] != '"' or cur == esc:
                # no end-of-charstr found: the last char is not a
                # double-quote, or it is the escaped half of a doubled one
                return text, None
            return '', _unwrap(text[1:-1])
        #
        # 2) finding a double-quote
        if text[cur:1+cur] == '"':
            if cur == esc:
                # current double-quote escaped, unsetting escape cursor
                esc = None
            elif text[1+cur:2+cur] == '"':
                # current double-quote not escaped, escaping next char
                esc = 1+cur
            else:
                # end of charstr
                return text[1+cur:].strip(), _unwrap(text[1:cur])


def extract_multi(text=''):
    """
    extracts the coma-separated textual components enclosed within
    curly-brackets at the start of text
    returns the remaining text, and the list of extracted textual components
    (or the empty / None content when there is nothing to split)
    """
    # e.g. { comp1, comp2, comp3 }
    rest, content = extract_curlybrack(text)
    if not content:
        return rest, content
    # boundaries of each field: virtual comas are set right before and
    # right after the content, around the top-level comas found in it
    bounds = [-1]
    bounds.extend(search_top_lvl_sep(content, ','))
    bounds.append(len(content))
    comps = [strip(content[before + 1:after])
             for before, after in zip(bounds[:-1], bounds[1:])]
    return rest, comps


def extract_set(text=''):
    """
    extracts the list of root and extended textual components,
    each component being separated with "|",
    and root and extension being separated with commas and "..."
    taking care of character strings definition between double-quotes "

    returns a dict with root and ext keys: root is a list of strings,
    ext is None (no extensibility marker) or a list of strings

    raises ASN1Err if a closing curlybracket has no opening counterpart,
    or if the coma-separated groups do not fit any of the supported layouts
    """
    # 1) we go char by char with a state machine, looking for:
    # - unescaped double-quote "
    # - or separator |
    # - or coma ,
    #
    text = text.strip()
    #
    # valset: list of all the groups of values
    # valgrp: current group of values
    # value : chars of the current value
    valset, valgrp, value = [], [], []
    #
    # state that says if we are in a charstr, between " or not
    # we do not evaluate escaped double-quotes especially,
    # as it is like we are leaving and reentering the charstr state
    charstr = False
    #
    # depth of the inner sets, between { and }, we are currently in
    innerset = 0
    #
    for char in text:
        value.append(char)
        if char == '"':
            charstr = not charstr
            continue
        if charstr:
            # nothing is evaluated within a charstr
            continue
        if char == '{':
            innerset += 1
        elif char == '}':
            innerset -= 1
            if innerset < 0:
                raise ASN1Err('extract_set, invalid number of closing '
                              'curlybrackets, {0!r}'.format(text))
        if innerset == 0 and char in ('|', ','):
            # end of the current value: the separator itself is dropped
            valgrp.append(''.join(value[:-1]).strip())
            value = []
            if char == ',':
                # end of the current group too
                valset.append(valgrp)
                valgrp = []
    if value:
        valgrp.append(''.join(value).strip())
    if valgrp:
        valset.append(valgrp)
    #
    # 2) we evaluate the list of groups found and the potential extensibility
    # marker in between, and build the resulting root / ext dict
    #
    marker = ['...']
    groups = len(valset)
    if groups == 0:
        return {'root': [], 'ext': None}
    if groups == 1:
        if valset[0] == marker:
            return {'root': [], 'ext': []}
        return {'root': valset[0], 'ext': None}
    if groups == 2:
        if valset[0] == marker:
            return {'root': [], 'ext': valset[1]}
        if valset[1] == marker:
            return {'root': valset[0], 'ext': []}
    elif groups == 3 and valset[1] == marker:
        return {'root': valset[0], 'ext': valset[2]}
    # any other layout is invalid
    raise ASN1Err('extract_set, invalid coma-separated groups, {0!r}'\
                  .format(valset))


def extract_from_import(text=''):
    """
    extracts the module name, reference and / or OID set after a FROM import
    statement, the `text` argument must start with the FROM keyword

    returns a 2-tuple with
        integer: length of the text containing the whole FROM statement
        dict: with "name", "oid", "oidref" and "with" keys

    the expected structure of text is enforced with assert statements
    """
    def _at_boundary(offset):
        # the FROM statement is over when the text is exhausted, or when
        # what follows is a symbol of the next import statement
        remain = text[offset:]
        return SYNT_RE_MODULEFROM_SYM.match(remain) or not remain
    #
    m_from = SYNT_RE_MODULEFROM.match(text)
    assert(m_from)
    cur = m_from.end()
    ret = {'name': m_from.group(1), 'oid': None, 'oidref': None, 'with': None}
    # check if we stop or continue with an OID value or OID reference
    if _at_boundary(cur):
        return cur, ret
    m_oid = SYNT_RE_MODULEFROM_OID.match(text[cur:])
    assert(m_oid)
    cur += m_oid.end()
    # exactly one alternative is expected: OID reference or OID value
    assert(None in m_oid.groups())
    oidref = m_oid.group(1)
    if oidref:
        ret['oidref'] = oidref
    else:
        ret['oid'] = m_oid.group(2)
    # check if there is a final WITH stmt
    m_with = SYNT_RE_MODULEFROM_WIT.match(text[cur:])
    if m_with:
        ret['with'] = m_with.group(1)
        cur += m_with.end()
    # final control
    assert(_at_boundary(cur))
    return cur, ret