manuf: Skip some start words in short name

Skip some leading location words in company names, since they are just repeated low-value information. Many different Chinese companies will otherwise shorten to the same name (Shenzhen, for example). This is a heuristic and not 100% reliable, but in the vast majority of cases it cuts down on noise and generates more informative names.
2023-07-09 01:55:26 +01:00 · 2023-07-09 01:55:26 +01:00 · f44e088329
parent ac57a25ed8
commit f44e088329
2 changed files with 2255 additions and 2238 deletions
--- a/4476
+++ b/4476
--- a/tools/make-manuf.py
+++ b/tools/make-manuf.py
@ -103,6 +103,18 @@ general_terms = '|'.join([
    'z ?o ?o'
    ])

+# Chinese company names tend to start with the location; skip it (non-exhaustive list).
+skip_start = [
+    'shengzen',
+    'shenzhen',
+    'beijing',
+    'shanghai',
+    'wuhan',
+    'hangzhou',
+    'guangxi',
+]
+
+
 def shorten(manuf):
    '''Convert a long manufacturer name to abbreviated and short names'''
    # Normalize whitespace.
@ -124,6 +136,11 @@ def shorten(manuf):
    # ...but make sure we don't remove everything.
    if not all(s == ' ' for s in plain_manuf):
        manuf = plain_manuf
+
+    split = manuf.split()
+    if len(split) > 1 and split[0].lower() in skip_start:
+        manuf = ' '.join(split[1:])
+
    # Remove all spaces
    manuf = re.sub(r'\s+', '', manuf)