add contrib/

In libosmocore (and likely elsewhere) we have scores of packed structs with
sub-byte integer members that lack the necessary member reversal shims to be
able to work on big endian architectures.

Instead of manually editing each one of them and probably introducing errors in
the process, this script handles the change automatically and, in the future,
allows us to verify correctness in gerrit verifications.

Change-Id: I8e75b17d8071c7b3a2a171ba776fb76854b28a53
This commit is contained in:
Neels Hofmeyr 2018-11-15 23:29:56 +01:00 committed by Neels Hofmeyr
parent 49c06680e7
commit 7ab5fc1f3b
1 changed files with 369 additions and 0 deletions

contrib/ Executable file
View File

@ -0,0 +1,369 @@
#!/usr/bin/env python3
'''Using mad regexes, automatically make sure that all structs with sub-byte
integers have matching big-endian definitions. The idea is to save a lot of
manual effort, and to automatically verify that there are no errors.
This script most certainly has numerous holes and shortcomings, but actually,
if you hit problems with it, rather adjust your coding style so that this
script can deal with it...'''
import re
import sys
import codecs
import os.path
# All of the line-oriented patterns below are applied with fullmatch() to a
# single source line including its trailing newline, hence the closing '\s*'.

# Opening 'struct name {' and closing '} ...;' of a struct that starts in
# column 0.
re_struct_start = re.compile(r'^struct\s*[a-zA-Z_][a-zA-Z_0-9]*\s*{\s*$')
re_struct_end = re.compile(r'^}[^;]*;\s*$')

# Indented anonymous 'struct {' and its closing '} name;'.
re_substruct_start = re.compile(r'^\s+struct\s*{\s*$')
re_substruct_end = re.compile(r'^\s+}\s*([^;]*\s)[a-zA-Z_][a-zA-Z_0-9]*\s*;\s*$')

# One integer definition up to and including its ';'. Group 1 is the leading
# whitespace plus the type keywords (with trailing whitespace), the last group
# is the member list including the ';'.
# NOTE(review): 'uint[0-9]_t' accepts a single digit only (uint8_t), whereas
# the signed alternative 'int[0-9]+_t' accepts any width -- confirm whether
# that asymmetry is intended.
re_int_def = re.compile(r'(^\s*((const|unsigned|signed|char|int|long|int[0-9]+_t|uint[0-9]_t)\s+)+\s*)([^;]*;)',
                        re.DOTALL | re.MULTILINE)
# One member of such a definition, either 'name' or 'name:bits', followed by
# ',' or ';'. Group 1 is the member without the separator.
re_int_members = re.compile(r'([a-zA-Z_][a-zA-Z_0-9]*|[a-zA-Z_][a-zA-Z_0-9]*\s*:\s*[0-9]+)\s*[,;]\s*', re.DOTALL | re.MULTILINE)

# Preprocessor lines that delimit the endian-specific sections.
re_little_endian_ifdef = re.compile(r'#\s*(if|elif)\s+OSMO_IS_LITTLE_ENDIAN\s*(==\s*1\s*|)')
re_big_endian_ifdef = re.compile(r'#\s*(if|elif)\s+OSMO_IS_BIG_ENDIAN\s*')
re_else = re.compile(r'#\s*else\s*')
re_endif = re.compile(r'#\s*endif\s*')

# A C comment: either '/* ... */' (may span lines and may contain '*') or
# '// ...' up to the end of the line. The pattern is exactly one capturing
# group, so that re.split() returns code and comments alternately.
re_c_comment = re.compile(r'(/\*.*?\*/|//[^\n]*)', re.DOTALL)
def remove_c_comments(code_str):
    '''Return code_str with every match of re_c_comment cut out.'''
    return re_c_comment.sub('', code_str)
def section_struct_body(struct_body_lines):
    '''Divide a top-level struct body into alternating sections.

    The result is a list of lists of lines that alternates between
    "arbitrary" parts and "body" parts, always starting with an arbitrary
    part (which may be empty):
      [[arbitrary lines], [body lines], [arbitrary lines], [body lines], ...]

    Aim: handle each sub-struct on its own, and if there already are ifdefs for
    little and big endian, keep just the little endian bit and derive big
    endian from it.

    An arbitrary part is anything other than struct member definitions, like
    a 'struct {' or a '} sub_name;' line.
    Body lines are lines that define struct members (possibly with comments).
    The endian #if/#elif/#else/#endif lines themselves and the contents of
    any big endian section end up in neither kind of part.

    struct_body_lines: list of source lines between the struct's opening and
    closing line. The list is not modified.
    Return: the list of alternating arbitrary parts and body parts.
    '''
    # An '#else' line gets replaced by an explicit '#if' below; work on a copy
    # so that the caller's list stays untouched.
    lines = list(struct_body_lines)

    sections = []
    arbitrary_part = []
    def_part = []

    def end_def():
        '''If any member definitions were recorded, flush the pending
        arbitrary part and the definitions out as two new sections. In short,
        cut a section boundary.'''
        nonlocal arbitrary_part, def_part
        if def_part:
            sections.append(arbitrary_part)
            sections.append(def_part)
            arbitrary_part = []
            def_part = []

    j = 0
    while j < len(lines):
        line = lines[j]

        if (re_substruct_start.fullmatch(line)
                or re_substruct_end.fullmatch(line)):
            end_def()
            arbitrary_part.append(line)
            j += 1
            continue

        if re_big_endian_ifdef.fullmatch(line):
            end_def()
            # discard big endian section
            j += 1
            while j < len(lines):
                line = lines[j]
                if re_endif.fullmatch(line):
                    j += 1
                    break
                if re_little_endian_ifdef.fullmatch(line):
                    # keep that start of little endian section, not j++
                    break
                if re_else.fullmatch(line):
                    # there's an '#else' after big-endian. Shim a
                    # little-endian header in just for the loop.
                    lines[j] = '#if OSMO_IS_LITTLE_ENDIAN\n'
                    break
                j += 1
            continue

        if re_little_endian_ifdef.fullmatch(line):
            end_def()
            # record the little endian section as a body part of its own
            j += 1
            while j < len(lines):
                line = lines[j]
                if re_endif.fullmatch(line):
                    j += 1
                    break
                if re_big_endian_ifdef.fullmatch(line):
                    # keep that start of big endian section, not j++
                    break
                if re_else.fullmatch(line):
                    # there's an '#else' after little-endian. Shim a
                    # big-endian header in just for the loop.
                    lines[j] = '#if OSMO_IS_BIG_ENDIAN\n'
                    break
                def_part.append(line)
                j += 1
            # whichever way the section ended, it is a section boundary
            end_def()
            continue

        def_part.append(line)
        j += 1

    # flush the last section remaining that didn't see an explicit end
    end_def()
    # end_def() only flushes arbitrary_part if there was a def_part, so:
    if arbitrary_part:
        sections.append(arbitrary_part)
    return sections
def struct_body_to_big_endian(body_str):
    r'''Derive the big endian variant of a (little endian) struct body.

    body_str: a multi-line string containing the body of a struct, i.e.
    without sub-structs and without #if OSMO_IS_BIG_ENDIAN, like
      '\tconst char *foo;\n\tuint8_t moo:3, goo:2;\n\tuint8_t loo:3;\n\tvoid *baz;\n'

    Return None to indicate that there is no little/big endian split
    required, which is the case if there are no sub-byte integers. Otherwise
    return a multi-line string of the big-endian version of this same struct
    body, where sub-byte ints are reversed at byte boundaries, and all others
    are copied 1:1. Comments and empty lines are not carried over, and the
    result has no trailing newline.

    Raise Exception if the sub-byte members do not add up to whole bytes, or
    if the integer type changes in the middle of a byte.
    '''
    # kick comments out of the code analysis. They will end up being stripped
    # from big-endian only.
    body_str = remove_c_comments(body_str)

    def_strs = ['%s;' % def_str for def_str in body_str.split(';') if def_str.strip()]

    # classify defs as containing sub-byte members or not
    # defs = [ (True, 'uint8_t foo:3, bar:5;', 'uint8_t ', ['foo:3', 'bar:5']),
    #          (False, 'int baz;'),...]
    defs = []
    any_sub_byte_ints = False
    for one_def in def_strs:
        int_def = re_int_def.fullmatch(one_def)
        if not int_def:
            # not even a number, same for big and little endian
            defs.append((False, one_def))
            continue

        # group 1 is the type including surrounding whitespace, the last
        # group is the list of members
        int_type = int_def.group(1)
        members_str = int_def.groups()[-1]
        members = [int_member.group(1)
                   for int_member in re_int_members.finditer(members_str)]

        if not any(':' in member for member in members):
            defs.append((False, one_def))
            continue

        defs.append((True, one_def, int_type, members))
        any_sub_byte_ints = True

    if not any_sub_byte_ints:
        return None

    # now the interesting part, go over the defs, and reverse the sub-byte ints
    # at byte boundaries.
    got_bits = 0
    byte_type = None
    members_within_a_byte = []
    big_endian_defs = []

    for classified_def in defs:
        if not classified_def[0]:
            if got_bits:
                raise Exception('sub-byte members do not add up to clean byte bounds: %r' % members_within_a_byte)
            big_endian_defs.append(classified_def[1])
            continue

        _, one_def, int_type, members = classified_def

        if byte_type and byte_type.strip() != int_type.strip():
            raise Exception('mismatching type continuation after incomplete byte: %r %r to %r'
                            % (byte_type, members_within_a_byte, int_type))
        byte_type = int_type

        for member in members:
            member_name, bits_str = member.split(':')
            member_name = member_name.strip()
            bits = int(bits_str)
            member = '%s:%d' % (member_name, bits)
            members_within_a_byte.append(member)
            got_bits += bits

            if got_bits == 8:
                # reverse these.
                big_endian_defs.append('%s%s;' % (byte_type, ', '.join(reversed(members_within_a_byte))))
                members_within_a_byte = []
                byte_type = None
                got_bits = 0
            elif got_bits > 8:
                raise Exception('sub-byte int breaks clean byte bounds: %s -- %d + %d = %d bits'
                                % (member, got_bits - bits, bits, got_bits))

    # members still pending here never reached a byte boundary; they would
    # otherwise be missing from the big endian result without any notice.
    if got_bits:
        raise Exception('sub-byte members do not add up to clean byte bounds: %r' % members_within_a_byte)

    # strip empty lines, and clean lines' whitespace errors we might have
    # taken in with the type names
    lines = [l.rstrip(' \t')
             for l in ''.join(big_endian_defs).split('\n')
             if l.strip()]
    return '\n'.join(lines)
def handle_struct_body(body_str):
    '''Wrap a struct body in little/big endian preprocessor sections.

    body_str: multi-line string of struct member definitions.
    Return body_str unchanged if struct_body_to_big_endian() yields no big
    endian variant for it. Otherwise return body_str inside an
    '#if OSMO_IS_LITTLE_ENDIAN' section, followed by the derived variant
    inside an '#elif OSMO_IS_BIG_ENDIAN' section and a closing '#endif'.
    '''
    big_endian_body_str = struct_body_to_big_endian(body_str)
    if not big_endian_body_str:
        return body_str

    return ''.join((
        '#if OSMO_IS_LITTLE_ENDIAN\n',
        body_str,
        # NOTE(review): the path in this generated comment has an unbalanced
        # '(' and names no file -- it looks truncated; confirm the intended
        # script path.
        '#elif OSMO_IS_BIG_ENDIAN\n'
        '/* auto-generated from the little endian part above (libosmocore/contrib/ */\n',
        big_endian_body_str,
        # the big endian body comes without a trailing newline
        '\n#endif\n',
    ))
def _check_file(f):
    '''Add big endian variants to all packed structs in the file f, in place.

    Files not ending in '.h', '.c' or '.cpp' are ignored. The file is read
    and written as UTF-8, and is written only if its content changed.
    If the result uses OSMO_IS_LITTLE_ENDIAN, make sure that
    osmocom/core/endian.h is included.

    Raise Exception if a struct cannot be handled, or if there is no
    '#include' line to place the endian.h include after.
    '''
    if not f.endswith(('.h', '.c', '.cpp')):
        return

    with codecs.open(f, "r", "utf-8") as fd:
        file_lines = fd.readlines()

    # section the file into
    # [ ["no struct def"], ["struct {...};"], ["no struct def"], ... ]
    sections = []
    in_struct = False
    buf = []
    for line in file_lines:
        if not in_struct and re_struct_start.fullmatch(line):
            # flush whatever might still be in buf from before
            sections.append(buf)
            # start an in_struct section
            buf = [line]
            in_struct = True
        elif in_struct and re_struct_end.fullmatch(line):
            # add this end to the in_struct section and then start a non-struct section
            buf.append(line)
            sections.append(buf)
            in_struct = False
            buf = []
        else:
            buf.append(line)
    # flush any leftovers in buf
    if buf:
        sections.append(buf)

    # A struct start without a matching end leaves a trailing section that is
    # not a complete struct; leave that one alone.
    complete_sections = len(sections) - 1 if in_struct else len(sections)

    # examine each struct, i.e. every second item in 'sections'
    for i in range(1, complete_sections, 2):
        struct = sections[i]

        # If the struct isn't packed, we need not bother.
        # The practical use of this: in some structs we have booleans in the
        # form of
        #     integer flag:1;
        # and these don't add up to bytes, and cause errors. So let's skip all
        # non-packed structs, then all of those are out of the picture.
        if 'packed' not in struct[-1]:
            continue

        try:
            # assume the 'struct foo {' is on the first line, the closing brace
            # '} __attribute...;' on the last, and the rest are individual
            # definitions split by ';'.
            struct_body_lines = struct[1:-1]
            struct_body_parts = section_struct_body(struct_body_lines)

            new_struct_body_parts = []
            for j, part_lines in enumerate(struct_body_parts):
                part = ''.join(part_lines)
                # even items are arbitrary text, odd items are member definitions
                if j & 1:
                    part = handle_struct_body(part)
                new_struct_body_parts.append(part)

            sections[i] = [struct[0], ''.join(new_struct_body_parts), struct[-1]]
        except Exception as e:
            raise Exception('ERROR in struct %r' % struct[0]) from e

    # phew. result.
    result = ''.join(''.join(s) for s in sections)

    # see if osmocom/core/endian.h is needed and included.
    if (not f.endswith('endian.h')
            and 'OSMO_IS_LITTLE_ENDIAN' in result
            and '#include <osmocom/core/endian.h>' not in result):
        # add the include after the last 'osmocom/core' include
        last_include_start = result.rfind('#include <osmocom/core/')
        if last_include_start < 0:
            last_include_start = result.rfind('#include <osmocom/')
        if last_include_start < 0:
            last_include_start = result.rfind('#include')
        if last_include_start < 0:
            raise Exception('do not know where to include osmocom/core/endian.h in %r' % f)

        insert_at = result.find('\n', last_include_start)
        if insert_at < 0:
            # the last include is on the final line, which lacks a newline
            insert_at = len(result)
        result = result[:insert_at] + '\n#include <osmocom/core/endian.h>' + result[insert_at:]

    # leave files alone that need no change
    if result == ''.join(file_lines):
        return

    with codecs.open(f, "w", "utf-8") as fd:
        fd.write(result)
def check_file(f):
    '''Run _check_file() on the path f, and name the file in the error if
    that fails.

    Raise Exception, with the original error attached as its cause.
    '''
    try:
        _check_file(f)
    except Exception as e:
        raise Exception('ERROR IN FILE %r' % f) from e
# Each command line argument is a file or a directory to check; without
# arguments, check the current directory.
args = sys.argv[1:]
if not args:
    args = ['.']

for f in args:
    if os.path.isdir(f):
        # the two positional None are os.walk()'s topdown and onerror
        for parent_path, subdirs, files in os.walk(f, None, None):
            for ff in files:
                check_file(os.path.join(parent_path, ff))
    else:
        # a single file named on the command line
        check_file(f)
# vim: tabstop=4 shiftwidth=4 expandtab