From f6fceb8684c6aaccdf2d0661663866a52bb0cc66 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Wed, 27 Dec 2023 21:44:56 +0100 Subject: [PATCH] Implement convoluted encoding of UCS-2 as per TS 102 221 Annex A TS 102 221 Annex A defines three variants of encoding UCS-2 characters into byte streams in files on UICC cards: One rather simplistic one, and two variants for optimizing memory utilization on the card. Let's impelement a construct "Ucs2Adapter" class for this. Change-Id: Ic8bc8f71079faec1bf0e538dc0dfa21403869c6d --- pySim/construct.py | 158 ++++++++++++++++++++++++++++++++++++++++ tests/test_construct.py | 44 +++++++++++ 2 files changed, 202 insertions(+) diff --git a/pySim/construct.py b/pySim/construct.py index f78adfe8..778a878a 100644 --- a/pySim/construct.py +++ b/pySim/construct.py @@ -48,6 +48,164 @@ class Utf8Adapter(Adapter): def _encode(self, obj, context, path): return codecs.encode(obj, "utf-8") +class Ucs2Adapter(Adapter): + """convert a bytes() type that contains UCS2 encoded characters encoded as defined in TS 102 221 + Annex A to normal python string representation (and back).""" + def _decode(self, obj, context, path): + # In case the string contains only 0xff bytes we interpret it as an empty string + if obj == b'\xff' * len(obj): + return "" + if obj[0] == 0x80: + # TS 102 221 Annex A Variant 1 + return codecs.decode(obj[1:], 'utf_16_be') + elif obj[0] == 0x81: + # TS 102 221 Annex A Variant 2 + out = "" + # second byte contains a value indicating the number of characters + num_of_chars = obj[1] + # the third byte contains an 8 bit number which defines bits 15 to 8 of a 16 bit base + # pointer, where bit 16 is set to zero, and bits 7 to 1 are also set to zero. These + # sixteen bits constitute a base pointer to a "half-page" in the UCS2 code space + base_ptr = obj[2] << 7 + for ch in obj[3:3+num_of_chars]: + # if bit 8 of the byte is set to zero, the remaining 7 bits of the byte contain a + # GSM Default Alphabet character, whereas if bit 8 of the byte is set to one, then + # the remaining seven bits are an offset value added to the 16 bit base pointer + # defined earlier, and the resultant 16 bit value is a UCS2 code point + if ch & 0x80: + codepoint = (ch & 0x7f) + base_ptr + out += codecs.decode(codepoint.to_bytes(2, byteorder='big'), 'utf_16_be') + else: + out += codecs.decode(bytes([ch]), 'gsm03.38') + return out + elif obj[0] == 0x82: + # TS 102 221 Annex A Variant 3 + out = "" + # second byte contains a value indicating the number of characters + num_of_chars = obj[1] + # third and fourth bytes contain a 16 bit number which defines the complete 16 bit base + # pointer to a half-page in the UCS2 code space, for use with some or all of the + # remaining bytes in the string + base_ptr = obj[2] << 8 | obj[3] + for ch in obj[4:4+num_of_chars]: + # if bit 8 of the byte is set to zero, the remaining 7 bits of the byte contain a + # GSM Default Alphabet character, whereas if bit 8 of the byte is set to one, the + # remaining seven bits are an offset value added to the base pointer defined in + # bytes three and four, and the resultant 16 bit value is a UCS2 code point, else: # + # GSM default alphabet + if ch & 0x80: + codepoint = (ch & 0x7f) + base_ptr + out += codecs.decode(codepoint.to_bytes(2, byteorder='big'), 'utf_16_be') + else: + out += codecs.decode(bytes([ch]), 'gsm03.38') + return out + else: + raise ValueError('First byte of TS 102 221 UCS-2 must be 0x80, 0x81 or 0x82') + + def _encode(self, obj, context, path): + def encodable_in_gsm338(instr: str) -> bool: + """Determine if given input string is encode-ale in gsm03.38.""" + try: + # TODO: figure out if/how we can constrain to default alphabet. The gsm0338 + # library seems to include the spanish lock/shift table + codecs.encode(instr, 'gsm03.38') + except ValueError: + return False + return True + + def codepoints_not_in_gsm338(instr: str) -> typing.List[int]: + """Return an integer list of UCS2 codepoints for all characters of 'inster' + which are not representable in the GSM 03.38 default alphabet.""" + codepoint_list = [] + for c in instr: + if encodable_in_gsm338(c): + continue + c_codepoint = int.from_bytes(codecs.encode(c, 'utf_16_be'), byteorder='big') + codepoint_list.append(c_codepoint) + return codepoint_list + + def diff_between_min_and_max_of_list(inlst: typing.List) -> int: + return max(inlst) - min(inlst) + + def encodable_in_variant2(instr: str) -> bool: + codepoint_prefix = None + for c in instr: + if encodable_in_gsm338(c): + continue + c_codepoint = int.from_bytes(codecs.encode(c, 'utf_16_be'), byteorder='big') + if c_codepoint >= 0x8000: + return False + c_prefix = c_codepoint >> 7 + if codepoint_prefix is None: + codepoint_prefix = c_prefix + else: + if c_prefix != codepoint_prefix: + return False + return True + + def encodable_in_variant3(instr: str) -> bool: + codepoint_list = codepoints_not_in_gsm338(instr) + # compute delta between max and min; check if it's encodable in 7 bits + if diff_between_min_and_max_of_list(codepoint_list) >= 0x80: + return False + return True + + def _encode_variant1(instr: str) -> bytes: + """Encode according to TS 102 221 Annex A Variant 1""" + return b'\x80' + codecs.encode(obj, 'utf_16_be') + + def _encode_variant2(instr: str) -> bytes: + """Encode according to TS 102 221 Annex A Variant 2""" + codepoint_prefix = None + # second byte contains a value indicating the number of characters + hdr = b'\x81' + len(instr).to_bytes(1, byteorder='big') + chars = b'' + for c in instr: + try: + enc = codecs.encode(c, 'gsm03.38') + except ValueError: + c_codepoint = int.from_bytes(codecs.encode(c, 'utf_16_be'), byteorder='big') + c_prefix = c_codepoint >> 7 + if codepoint_prefix is None: + codepoint_prefix = c_prefix + assert codepoint_prefix == c_prefix + enc = (0x80 + (c_codepoint & 0x7f)).to_bytes(1, byteorder='big') + chars += enc + if codepoint_prefix == None: + codepoint_prefix = 0 + return hdr + codepoint_prefix.to_bytes(1, byteorder='big') + chars + + def _encode_variant3(instr: str) -> bytes: + """Encode according to TS 102 221 Annex A Variant 3""" + # second byte contains a value indicating the number of characters + hdr = b'\x82' + len(instr).to_bytes(1, byteorder='big') + chars = b'' + codepoint_list = codepoints_not_in_gsm338(instr) + codepoint_base = min(codepoint_list) + for c in instr: + try: + # if bit 8 of the byte is set to zero, the remaining 7 bits of the byte contain a GSM + # Default # Alphabet character + enc = codecs.encode(c, 'gsm03.38') + except ValueError: + # if bit 8 of the byte is set to one, the remaining seven bits are an offset + # value added to the base pointer defined in bytes three and four, and the + # resultant 16 bit value is a UCS2 code point + c_codepoint = int.from_bytes(codecs.encode(c, 'utf_16_be'), byteorder='big') + c_codepoint_delta = c_codepoint - codepoint_base + assert c_codepoint_delta < 0x80 + enc = (0x80 + c_codepoint_delta).to_bytes(1, byteorder='big') + chars += enc + # third and fourth bytes contain a 16 bit number which defines the complete 16 bit base + # pointer to a half-page in the UCS2 code space + return hdr + codepoint_base.to_bytes(2, byteorder='big') + chars + + if encodable_in_variant2(obj): + return _encode_variant2(obj) + elif encodable_in_variant3(obj): + return _encode_variant3(obj) + else: + return _encode_variant1(obj) class BcdAdapter(Adapter): """convert a bytes() type to a string of BCD nibbles.""" diff --git a/tests/test_construct.py b/tests/test_construct.py index f1bee5a4..11822a8a 100644 --- a/tests/test_construct.py +++ b/tests/test_construct.py @@ -33,5 +33,49 @@ class TestUtils(unittest.TestCase): self.assertEqual(filter_dict(inp), out) +class TestUcs2Adapter(unittest.TestCase): + # the three examples from TS 102 221 Annex A + EXAMPLE1 = b'\x80\x00\x30\x00\x31\x00\x32\x00\x33' + EXAMPLE2 = b'\x81\x05\x13\x53\x95\xa6\xa6\xff\xff' + EXAMPLE3 = b'\x82\x05\x05\x30\x2d\x82\xd3\x2d\x31' + ad = Ucs2Adapter(GreedyBytes) + + def test_example1_decode(self): + dec = self.ad._decode(self.EXAMPLE1, None, None) + self.assertEqual(dec, "0123") + + def test_example2_decode(self): + dec = self.ad._decode(self.EXAMPLE2, None, None) + self.assertEqual(dec, "S\u0995\u09a6\u09a6\u09ff") + + def test_example3_decode(self): + dec = self.ad._decode(self.EXAMPLE3, None, None) + self.assertEqual(dec, "-\u0532\u0583-1") + + testdata = [ + # variant 2 with only GSM alphabet characters + ( "mahlzeit", '8108006d61686c7a656974' ), + # variant 2 with mixed GSM alphabet + UCS2 + ( "mahlzeit\u099523", '810b136d61686c7a656974953233' ), + # variant 3 due to codepoint exceeding 8 bit + ( "mahl\u8023zeit", '820980236d61686c807a656974' ), + # variant 1 as there is no common codepoint pointer / prefix + ( "\u3000\u2000\u1000", '80300020001000' ), + ] + + def test_data_decode(self): + for string, encoded_hex in self.testdata: + encoded = h2b(encoded_hex) + dec = self.ad._decode(encoded, None, None) + self.assertEqual(dec, string) + + def test_data_encode(self): + for string, encoded_hex in self.testdata: + encoded = h2b(encoded_hex) + re_enc = self.ad._encode(string, None, None) + self.assertEqual(encoded, re_enc) + + + if __name__ == "__main__": unittest.main()