make-manuf.py: Handle more business types and other fixes.
Move our business types and general terms into a list and add more entries. Convert names to title case only when they are entirely upper case. Remove double quotes when shortening names. Change-Id: I31e9799986542270350b8c2436929f293de4e36c Reviewed-on: https://code.wireshark.org/review/35577 Reviewed-by: Gerald Combs <gerald@wireshark.org>
This commit is contained in:
parent
96965c6b4a
commit
f8808b8bd0
|
@ -1,5 +1,4 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# Wireshark - Network traffic analyzer
|
||||
# By Gerald Combs <gerald@wireshark.org>
|
||||
|
@ -58,6 +57,49 @@ def open_url(url):
|
|||
|
||||
return (body, dict(response.info()))
|
||||
|
||||
# These are applied after punctuation has been removed.
|
||||
# More examples at https://en.wikipedia.org/wiki/Incorporation_(business)
|
||||
# Regex alternation of business-type suffixes and other generic words that
# carry no identifying information in a manufacturer name. Each entry is a
# regex fragment; " ?" allows for the optional space left behind once
# punctuation has been replaced (e.g. "B.V." -> "B V"). Longer variants are
# listed alongside their shorter forms ("s ?a ?r ?l" and "s ?a"); the pattern
# that consumes this string decides which one wins.
general_terms = "|".join((
    # Scandinavian / German / Dutch forms
    "a/s",
    "ab",  # Also follows "Oy", which is covered below.
    "ag",
    "b ?v",
    # English-language company designations
    "closed joint stock company",
    "co",
    "company",
    "corp",
    "corporation",
    "de c ?v",  # Follows "S.A.", which is covered separately below.
    "gmbh",
    "holding",
    "inc",
    "incorporated",
    "jsc",
    "kg",
    "k k",  # "K.K." as in "kabushiki kaisha", but not "K+K" as in "K+K Messtechnik".
    "limited",
    "llc",
    "ltd",
    "n ?v",
    "oao",
    "open joint stock company",
    "ooo",
    "oy",
    "oyj",
    "plc",
    "pty",
    "pvt",
    # Romance-language and Polish forms, with optional inner spaces
    "s ?a ?r ?l",
    "s ?a",
    "s ?p ?a",
    "sp ?k",
    "s ?r ?l",
    # Generic filler words
    "systems",
    "the",
    "zao",
    "z ?o ?o",
))
|
||||
|
||||
def shorten(manuf):
|
||||
'''Convert a long manufacturer name to abbreviated and short names'''
|
||||
# Normalize whitespace.
|
||||
|
@ -66,15 +108,16 @@ def shorten(manuf):
|
|||
# Add exactly one space on each end.
|
||||
# XXX This appears to be for the re.sub below.
|
||||
manuf = u' {} '.format(manuf)
|
||||
# Convert to consistent case
|
||||
manuf = manuf.title()
|
||||
# Convert all caps to title case
|
||||
if manuf.isupper():
|
||||
manuf = manuf.title()
|
||||
# Remove any punctuation
|
||||
# XXX Use string.punctuation? Note that it includes '-' and '*'.
|
||||
manuf = re.sub(u"[',.()]", ' ', manuf)
|
||||
manuf = re.sub(u"[\"',.()]", ' ', manuf)
|
||||
# & isn't needed when Standalone
|
||||
manuf = manuf.replace(" & ", " ")
|
||||
# Remove any "the", "inc", "plc" ...
|
||||
manuf = re.sub('\W(the|incorporated|inc|plc|systems|corporation|corp|s/a|a/s|ab|ag|kg|gmbh|company|co|limited|ltd|holding|spa)(?= )', '', manuf, flags=re.IGNORECASE)
|
||||
# Remove business types and other general terms ("the", "inc", "plc", etc.)
|
||||
manuf = re.sub('\W(' + general_terms + ')(?= )', '', manuf, flags=re.IGNORECASE)
|
||||
# Remove all spaces
|
||||
manuf = re.sub('\s+', '', manuf)
|
||||
|
||||
|
|
Loading…
Reference in New Issue