make-manuf.py: Handle more business types and other fixes.

Move our business types and general terms to a list and add more. Convert
names to title case only when they are entirely upper case. Remove double
quotes when shortening names.

Change-Id: I31e9799986542270350b8c2436929f293de4e36c
Reviewed-on: https://code.wireshark.org/review/35577
Reviewed-by: Gerald Combs <gerald@wireshark.org>
This commit is contained in:
Gerald Combs 2019-12-28 12:06:00 -08:00
parent 96965c6b4a
commit f8808b8bd0
1 changed files with 49 additions and 6 deletions

View File

@ -1,5 +1,4 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Wireshark - Network traffic analyzer
# By Gerald Combs <gerald@wireshark.org>
@ -58,6 +57,49 @@ def open_url(url):
return (body, dict(response.info()))
# Regex alternation of business-entity suffixes and other filler words.
# Each entry is a regex fragment: " ?" marks an optional space left behind
# once punctuation has been replaced by spaces (so "S.A." arrives as "s a").
# The fragments are joined with "|" in the order listed here.
# More examples at https://en.wikipedia.org/wiki/Incorporation_(business)
general_terms = '|'.join((
    'a/s',
    'ab',  # Also follows "Oy", which is covered below.
    'ag',
    'b ?v',
    'closed joint stock company',
    'co',
    'company',
    'corp',
    'corporation',
    'de c ?v',  # Follows "S.A.", which is covered separately below.
    'gmbh',
    'holding',
    'inc',
    'incorporated',
    'jsc',
    'kg',
    'k k',  # "K.K." as in "kabushiki kaisha", but not "K+K" as in "K+K Messtechnik".
    'limited',
    'llc',
    'ltd',
    'n ?v',
    'oao',
    'open joint stock company',
    'ooo',
    'oy',
    'oyj',
    'plc',
    'pty',
    'pvt',
    's ?a ?r ?l',
    's ?a',
    's ?p ?a',
    'sp ?k',
    's ?r ?l',
    'systems',
    'the',
    'zao',
    'z ?o ?o',
))
def shorten(manuf):
'''Convert a long manufacturer name to abbreviated and short names'''
# Normalize whitespace.
@ -66,15 +108,16 @@ def shorten(manuf):
# Add exactly one space on each end.
# XXX This appears to be for the re.sub below.
manuf = u' {} '.format(manuf)
# Convert to consistent case
manuf = manuf.title()
# Convert all caps to title case
if manuf.isupper():
manuf = manuf.title()
# Remove any punctuation
# XXX Use string.punctuation? Note that it includes '-' and '*'.
manuf = re.sub(u"[',.()]", ' ', manuf)
manuf = re.sub(u"[\"',.()]", ' ', manuf)
# & isn't needed when Standalone
manuf = manuf.replace(" & ", " ")
# Remove any "the", "inc", "plc" ...
manuf = re.sub('\W(the|incorporated|inc|plc|systems|corporation|corp|s/a|a/s|ab|ag|kg|gmbh|company|co|limited|ltd|holding|spa)(?= )', '', manuf, flags=re.IGNORECASE)
# Remove business types and other general terms ("the", "inc", "plc", etc.)
manuf = re.sub('\W(' + general_terms + ')(?= )', '', manuf, flags=re.IGNORECASE)
# Remove all spaces
manuf = re.sub('\s+', '', manuf)