wireshark/tools/check_spelling.py

394 lines
13 KiB
Python
Executable File

#!/usr/bin/env python3
# Wireshark - Network traffic analyzer
# By Gerald Combs <gerald@wireshark.org>
# Copyright 1998 Gerald Combs
#
# SPDX-License-Identifier: GPL-2.0-or-later
import os
import re
import subprocess
import argparse
import signal
from collections import Counter
# Looks for spelling errors among strings found in source or documentation files.
# TODO: deal with contractions - pyspellcheck doesn't seem to handle apostrophies..
# For text colouring/highlighting.
class bcolors:
HEADER = '\033[95m'
OKBLUE = '\033[94m'
OKGREEN = '\033[92m'
ADDED = '\033[45m'
WARNING = '\033[93m'
FAIL = '\033[91m'
ENDC = '\033[0m'
BOLD = '\033[1m'
UNDERLINE = '\033[4m'
# Try to exit soon after Ctrl-C is pressed.
should_exit = False
def signal_handler(sig, frame):
global should_exit
should_exit = True
print('You pressed Ctrl+C - exiting')
signal.signal(signal.SIGINT, signal_handler)
# Create spellchecker, and augment with some Wireshark words.
from spellchecker import SpellChecker
# Set up our dict with words from text file.
spell = SpellChecker()
spell.word_frequency.load_text_file('./tools/wireshark_words.txt')
# Track words that were not found.
missing_words = []
# Split camelCase string into separate words.
def camelCaseSplit(identifier):
matches = re.finditer('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', identifier)
return [m.group(0) for m in matches]
# A File object contains all of the strings to be checked for a given file.
class File:
def __init__(self, file):
self.file = file
self.values = []
filename, extension = os.path.splitext(file)
self.code_file = extension in {'.c', '.cpp'}
with open(file, 'r') as f:
contents = f.read()
if self.code_file:
# Remove comments so as not to trip up RE.
contents = removeComments(contents)
# Find protocol name and add to dict.
# N.B. doesn't work when a variable is used instead of a literal for the protocol name...
matches = re.finditer(r'proto_register_protocol\s*\([\n\r\s]*\"(.*)\",[\n\r\s]*\"(.*)\",[\n\r\s]*\"(.*)\"', contents)
for m in matches:
protocol = m.group(3)
# Add to dict.
spell.word_frequency.load_words([protocol])
spell.known([protocol])
print('Protocol is: ' + bcolors.BOLD + protocol + bcolors.ENDC)
# Add a string found in this file.
def add(self, value):
self.values.append(value)
# Whole word is not recognised, but is it 2 words concatenated (without camelcase) ?
def checkMultiWords(self, word):
if len(word) < 6:
return False
# Don't consider if mixed cases.
if not (word.islower() or word.isupper()):
# But make an exception if only the fist letter is uppercase..
if not word == (word[0].upper() + word[1:]):
return False
# Try splitting into 2 words recognised at various points.
length = len(word)
for idx in range(3, length-3):
word1 = word[0:idx]
word2 = word[idx:]
if not spell.unknown([word1, word2]):
return True
return False
# Check the spelling of all the words we have found fir tgus fuke,
def spellCheck(self):
num_values = len(self.values)
this_value = 0
for v in self.values:
if should_exit:
exit(1)
this_value += 1
# Ignore includes.
if v.endswith('.h'):
continue
# Store original (as want to include for context in error report).
original = str(v)
# Replace most punctuation with spaces, and eliminate common format specifiers.
v = v.replace('.', ' ')
v = v.replace(',', ' ')
v = v.replace('`', ' ')
v = v.replace(':', ' ')
v = v.replace(';', ' ')
v = v.replace('"', ' ')
v = v.replace('\\', ' ')
v = v.replace('+', ' ')
v = v.replace('|', ' ')
v = v.replace('(', ' ')
v = v.replace(')', ' ')
v = v.replace('[', ' ')
v = v.replace(']', ' ')
v = v.replace('{', ' ')
v = v.replace('}', ' ')
v = v.replace('<', ' ')
v = v.replace('>', ' ')
v = v.replace('_', ' ')
v = v.replace('-', ' ')
v = v.replace('/', ' ')
v = v.replace('!', ' ')
v = v.replace('?', ' ')
v = v.replace('=', ' ')
v = v.replace('*', ' ')
v = v.replace('%', ' ')
v = v.replace('#', ' ')
v = v.replace('&', ' ')
v = v.replace('@', ' ')
v = v.replace('$', ' ')
v = v.replace("'", ' ')
v = v.replace('"', ' ')
v = v.replace('%u', '')
v = v.replace('%d', '')
v = v.replace('%s', '')
v = v.replace('\\n', ' ')
# Split into words.
value_words = v.split()
# Further split up any camelCase words.
words = []
for w in value_words:
words += camelCaseSplit(w)
# Check each word within this string in turn.
for word in words:
# Strip trailing digits from word.
word = word.rstrip('1234567890')
# Quote marks found in some of the docs...
word = word.replace('', '')
word = word.replace('', '')
if len(word) > 4 and spell.unknown([word]) and not self.checkMultiWords(word):
print(self.file, this_value, '/', num_values, '"' + original + '"', bcolors.FAIL + word + bcolors.ENDC,
' -> ', '?')
# TODO: this can be interesting, but takes too long!
# bcolors.OKGREEN + spell.correction(word) + bcolors.ENDC
global missing_words
missing_words.append(word)
def removeComments(code_string):
code_string = re.sub(re.compile("/\*.*?\*/",re.DOTALL ) ,"" ,code_string) # C-style comment
# Remove this for now as can get tripped up if see htpps://www.... within a string!
#code_string = re.sub(re.compile("//.*?\n" ) ,"" ,code_string) # C++-style comment
return code_string
def removeSingleQuotes(code_string):
code_string = code_string.replace('\"\\\\\"', "")
code_string = code_string.replace("\\\"", " ")
code_string = code_string.replace("'\"'", "")
return code_string
def removeHexSpecifiers(code_string):
# TODO: replace with single regexp?
code_string = code_string.replace('0x%02X', "")
code_string = code_string.replace('0x%02x', "")
code_string = code_string.replace('0x%04X', "")
code_string = code_string.replace('0x%04x', "")
code_string = code_string.replace('0x%08X', "")
code_string = code_string.replace('0x%08x', "")
return code_string
# Create a File object that knows about all of the strings in the given file.
def findStrings(filename):
with open(filename, 'r') as f:
contents = f.read()
# Remove comments & embedded quotes so as not to trip up RE.
contents = removeComments(contents)
contents = removeSingleQuotes(contents)
contents = removeHexSpecifiers(contents)
# Create file object.
file = File(filename)
# What we check depends upon file type.
if file.code_file:
# Code so only checking strings.
matches = re.finditer(r'\"([^\"]*)\"', contents)
for m in matches:
file.add(m.group(1))
else:
# A documentation file, so examine all words.
words = contents.split()
for w in words:
file.add(w)
return file
# Test for whether the given file was automatically generated.
def isGeneratedFile(filename):
# Open file
f_read = open(os.path.join(filename), 'r')
lines_tested = 0
for line in f_read:
# The comment to say that its generated is near the top, so give up once
# get a few lines down.
if lines_tested > 10:
f_read.close()
return False
if (line.find('Generated automatically') != -1 or
line.find('Autogenerated from') != -1 or
line.find('is autogenerated') != -1 or
line.find('automatically generated by Pidl') != -1 or
line.find('Created by: The Qt Meta Object Compiler') != -1 or
line.find('This file was generated') != -1):
f_read.close()
return True
lines_tested = lines_tested + 1
# OK, looks like a hand-written file!
f_read.close()
return False
def isAppropriateFile(filename):
file, extension = os.path.splitext(filename)
return extension in { '.adoc', '.c', '.cpp', '.pod'} or file.endswith('README')
def findFilesInFolder(folder):
files_to_check = []
for root, subfolders, files in os.walk(folder):
for f in files:
if should_exit:
return
f = os.path.join(root, f)
if isAppropriateFile(f) and not isGeneratedFile(f):
files_to_check.append(f)
return files_to_check
# Check the given dissector file.
def checkFile(filename):
file = findStrings(filename)
file.spellCheck()
#################################################################
# Main logic.
# command-line args. Controls which files should be checked.
# If no args given, will just scan epan/dissectors folder.
parser = argparse.ArgumentParser(description='Check calls in dissectors')
parser.add_argument('--file', action='store', default='',
help='specify individual dissector file to test')
parser.add_argument('--folder', action='store', default='',
help='specify folder to test')
parser.add_argument('--commits', action='store',
help='last N commits to check')
parser.add_argument('--open', action='store_true',
help='check open files')
args = parser.parse_args()
# Get files from wherever command-line args indicate.
files = []
if args.file:
# Add single specified file..
if not os.path.isfile(args.file):
print('Chosen file', args.file, 'does not exist.')
exit(1)
else:
files.append(args.file)
elif args.commits:
# Get files affected by specified number of commits.
command = ['git', 'diff', '--name-only', 'HEAD~' + args.commits]
files = [f.decode('utf-8')
for f in subprocess.check_output(command).splitlines()]
# Will examine dissector files only
files = list(filter(lambda f : isAppropriateFile(f), files))
elif args.open:
# Unstaged changes.
command = ['git', 'diff', '--name-only']
files = [f.decode('utf-8')
for f in subprocess.check_output(command).splitlines()]
# Only interested in dissector files.
files = list(filter(lambda f : is_dissector_file(f), files))
# Staged changes.
command = ['git', 'diff', '--staged', '--name-only']
files_staged = [f.decode('utf-8')
for f in subprocess.check_output(command).splitlines()]
# Only interested in dissector files.
files_staged = list(filter(lambda f : is_dissector_file(f), files_staged))
for f in files:
files.append(f)
for f in files_staged:
if not f in files:
files.append(f)
else:
# By default, scan dissectors
folder = os.path.join('epan', 'dissectors')
# But overwrite with any folder entry.
if args.folder:
folder = args.folder
if not os.path.isdir(folder):
print('Folder', folder, 'not found!')
exit(1)
# Find files from folder.
print('Looking for files in', folder)
files = findFilesInFolder(folder)
# If scanning a subset of files, list them here.
print('Examining:')
if args.file or args.commits or args.open:
if files:
print(' '.join(files), '\n')
else:
print('No files to check.\n')
else:
print('All dissector modules\n')
# Now check the chosen files.
for f in files:
# Jump out if control-C has been pressed.
if should_exit:
exit(1)
checkFile(f)
# Show the most commonly not-recognised words. TODO: depend upon a command-line option here?
print('')
counter = Counter(missing_words).most_common(100)
if len(counter) > 0:
for c in counter:
print(c[0], ':', c[1])
# Show error count.
print('\n' + bcolors.BOLD + str(len(missing_words)) + ' issues found' + bcolors.ENDC + '\n')