Add spell-checking script.

check_spelling.py scans Wireshark source or documentation files,
using the general dictionary from pyspellchecker, augmented by the contents
of wireshark_words.txt.

Can scan:
- entire folders (recursively)
- individual files
- open files (those with staged or unstaged git changes)
- files affected by recent git changes
This commit is contained in:
Martin Mathieson 2020-09-05 22:23:52 +01:00 committed by Gerald Combs
parent 6a841ce4d5
commit 22e02a9d06
2 changed files with 1262 additions and 0 deletions

390
tools/check_spelling.py Executable file
View File

@ -0,0 +1,390 @@
#!/usr/bin/env python3
# Wireshark - Network traffic analyzer
# By Gerald Combs <gerald@wireshark.org>
# Copyright 1998 Gerald Combs
#
# SPDX-License-Identifier: GPL-2.0-or-later
import os
import re
import subprocess
import argparse
import signal
from collections import Counter
# Looks for spelling errors among strings found in source or documentation files.
# TODO: deal with contractions - pyspellchecker doesn't seem to handle apostrophes.
# For text colouring/highlighting.
class bcolors:
    """ANSI escape sequences used to colour and highlight terminal output.

    Wrap text between one of the styling sequences and ENDC, e.g.
    bcolors.FAIL + word + bcolors.ENDC.
    """

    # Reset all attributes back to the terminal default.
    ENDC = '\033[0m'

    # Text attributes.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Foreground colours.
    FAIL = '\033[91m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    OKBLUE = '\033[94m'
    HEADER = '\033[95m'

    # Background colour.
    ADDED = '\033[45m'
# Cooperative shutdown: the SIGINT handler only raises this flag, and the
# long-running loops poll it so that they can stop soon after Ctrl-C.
should_exit = False


def signal_handler(sig, frame):
    """Record that Ctrl-C was pressed and tell the user.

    Both arguments are supplied by the signal machinery and are not used.
    """
    global should_exit
    should_exit = True
    print('You pressed Ctrl+C - exiting')


# Install the handler for SIGINT (Ctrl-C).
signal.signal(signal.SIGINT, signal_handler)
# Create spellchecker, and augment with some Wireshark words.
from spellchecker import SpellChecker
# Set up our dict with words from text file.
# The extra Wireshark-specific words are loaded on top of whatever
# SpellChecker() provides by default.  The path is relative to the current
# working directory, so presumably the script must be run from the top of
# the source tree - TODO confirm.
spell = SpellChecker()
spell.word_frequency.load_text_file('./tools/wireshark_words.txt')
# Track words that were not found.
# One entry is appended per occurrence (duplicates are kept), so the list
# can be counted to report the most common unrecognised words.
missing_words = []
def camelCaseSplit(identifier):
    """Return the list of pieces of *identifier* split at camelCase boundaries.

    A boundary is placed between a lowercase letter and a following uppercase
    letter, and before the last capital of an uppercase run that is followed
    by a lowercase letter (so 'HTTPServer' gives ['HTTP', 'Server']).
    An empty string gives an empty list.
    """
    # The only group is non-capturing, so findall() yields the whole matches.
    boundary_pattern = '.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)'
    return re.findall(boundary_pattern, identifier)
# A File object contains all of the strings to be checked for a given file.
class File:
    """The strings to be spell-checked for one source or documentation file.

    Attributes:
        file:      path of the file, as given to the constructor.
        values:    strings collected for checking (see add()).
        code_file: True for '.c' / '.cpp' files, False otherwise.
    """

    # Punctuation treated as a word separator: each character becomes a space.
    _PUNCTUATION = '.,`:;"\\+|()[]{}<>_-/!?=*%#&@\''
    _PUNCTUATION_TO_SPACE = str.maketrans(_PUNCTUATION, ' ' * len(_PUNCTUATION))

    # Common printf-style format specifiers that are not words.
    _FORMAT_SPECIFIERS = ('%u', '%d', '%s')

    # Quote marks found in some of the docs.
    # NOTE(review): the original characters were lost from the source text;
    # presumably they were the left/right curly double quotes - confirm.
    _CURLY_QUOTES = ('\u201c', '\u201d')

    def __init__(self, file):
        """Record the file type and teach the dictionary its protocol name(s).

        The file is read once here, only to look for
        proto_register_protocol() calls; the third string argument of each
        call found is added to the spell-checker's dictionary and reported.
        """
        self.file = file
        self.values = []

        _, extension = os.path.splitext(file)
        self.code_file = extension in {'.c', '.cpp'}

        with open(file, 'r') as f:
            contents = f.read()

        if self.code_file:
            # Remove comments so as not to trip up RE.
            contents = removeComments(contents)

        # Find protocol name and add to dict.
        # N.B. doesn't work when a variable is used instead of a literal for the protocol name...
        matches = re.finditer(r'proto_register_protocol\s*\([\n\r\s]*\"(.*)\",[\n\r\s]*\"(.*)\",[\n\r\s]*\"(.*)\"', contents)
        for m in matches:
            protocol = m.group(3)
            # Add to dict.
            spell.word_frequency.load_words([protocol])
            # NOTE(review): the result of this call is discarded.
            spell.known([protocol])
            print('Protocol is: ' + bcolors.BOLD + protocol + bcolors.ENDC)

    # Add a string found in this file.
    def add(self, value):
        """Append *value* to the list of strings to be checked."""
        self.values.append(value)

    # Whole word is not recognised, but is it 2 words concatenated (without camelcase) ?
    def checkMultiWords(self, word):
        """Return True if *word* can be cut into two recognised words.

        Words shorter than 6 characters are never considered.  Mixed-case
        words are rejected unless their first letter is uppercase.  Each
        candidate half is at least 3 characters long.
        """
        if len(word) < 6:
            return False

        # Don't consider if mixed cases.
        if not (word.islower() or word.isupper()):
            # But make an exception if only the first letter is uppercase..
            if not word == (word[0].upper() + word[1:]):
                return False

        # Try splitting into 2 words recognised at various points.
        length = len(word)
        for idx in range(3, length-3):
            word1 = word[0:idx]
            word2 = word[idx:]

            if not spell.unknown([word1, word2]):
                return True

        return False

    # Check the spelling of all the words we have found for this file.
    def spellCheck(self):
        """Report every unrecognised word among the collected strings.

        Each string is stripped of format specifiers and punctuation, split
        into words (including at camelCase boundaries), and every word longer
        than 4 characters is looked up.  Unknown words that are not two known
        words run together are printed with their context and appended to the
        module-level missing_words list.  Exits with status 1 if Ctrl-C has
        been pressed.
        """
        num_values = len(self.values)
        # this_value is the 1-based position shown in the report.
        for this_value, v in enumerate(self.values, start=1):
            if should_exit:
                exit(1)

            # Ignore includes.
            if v.endswith('.h'):
                continue

            # Store original (as want to include for context in error report).
            original = str(v)

            # Eliminate common format specifiers.  This has to happen before
            # '%' is turned into a space below, or they would never match.
            for specifier in self._FORMAT_SPECIFIERS:
                v = v.replace(specifier, ' ')

            # Replace most punctuation with spaces.
            v = v.translate(self._PUNCTUATION_TO_SPACE)

            # Split into words, then further split up any camelCase words.
            words = []
            for w in v.split():
                words += camelCaseSplit(w)

            # Check each word within this string in turn.
            for word in words:
                # Strip trailing digits from word.
                word = word.rstrip('1234567890')

                # Quote marks found in some of the docs...
                for quote in self._CURLY_QUOTES:
                    word = word.replace(quote, '')

                if len(word) > 4 and spell.unknown([word]) and not self.checkMultiWords(word):
                    print(self.file, this_value, '/', num_values, '"' + original + '"', bcolors.FAIL + word + bcolors.ENDC,
                          ' -> ', '?')
                    # TODO: this can be interesting, but takes too long!
                    # bcolors.OKGREEN + spell.correction(word) + bcolors.ENDC
                    missing_words.append(word)
def removeComments(code_string):
    """Return *code_string* with every C-style /* ... */ comment removed.

    Comments may span several lines.  Anything else, including C++-style
    '//' comments, is returned unchanged.
    """
    # re.DOTALL lets '.' match newlines so multi-line comments are covered;
    # the non-greedy '.*?' stops at the first closing '*/'.
    code_string = re.sub(r'/\*.*?\*/', '', code_string, flags=re.DOTALL)
    # C++-style comments are deliberately left alone for now: stripping them
    # gets tripped up by URLs such as https://www.... within a string.
    return code_string
def removeSingleQuotes(code_string):
    """Return *code_string* without the quote sequences that confuse string matching.

    Three sequences are handled, in this order: a string literal holding only
    an escaped backslash is deleted, an escaped double quote becomes a space,
    and a double quote written as a character literal is deleted.
    """
    substitutions = (
        ('\"\\\\\"', ""),
        ("\\\"", " "),
        ("'\"'", ""),
    )
    for sequence, replacement in substitutions:
        code_string = code_string.replace(sequence, replacement)
    return code_string
def removeHexSpecifiers(code_string):
    """Return *code_string* with the common '0x%0NX' hex format specifiers deleted.

    Covers widths 2, 4 and 8 in both upper and lower case.
    """
    # TODO: replace with single regexp?
    hex_specifiers = (
        '0x%02X',
        '0x%02x',
        '0x%04X',
        '0x%04x',
        '0x%08X',
        '0x%08x',
    )
    for specifier in hex_specifiers:
        code_string = code_string.replace(specifier, "")
    return code_string
def findStrings(filename):
    """Build and return a File holding every string of *filename* to be checked.

    For code files only the contents of double-quoted string literals are
    collected; for any other (documentation) file every whitespace-separated
    word is collected.
    """
    with open(filename, 'r') as source:
        text = source.read()

    # Remove comments, embedded quotes and hex specifiers so as not to trip
    # up the string-literal RE below.
    for scrub in (removeComments, removeSingleQuotes, removeHexSpecifiers):
        text = scrub(text)

    file = File(filename)

    # What we check depends upon file type.
    if file.code_file:
        # Code, so only the text between double quotes is of interest.
        candidates = re.findall(r'\"([^\"]*)\"', text)
    else:
        # A documentation file, so examine all words.
        candidates = text.split()

    for candidate in candidates:
        file.add(candidate)
    return file
# Test for whether the given file was automatically generated.
def isGeneratedFile(filename):
    """Return True if *filename* announces near its top that it was generated.

    Only the first 11 lines are examined, because the comment saying that a
    file is generated is near the top.  The file is closed on every path,
    including when reading it raises.
    """
    markers = (
        'Generated automatically',
        'Autogenerated from',
        'is autogenerated',
        'automatically generated by Pidl',
        'Created by: The Qt Meta Object Compiler',
    )

    with open(filename, 'r') as f_read:
        for lines_tested, line in enumerate(f_read):
            # Give up once we get a few lines down.
            if lines_tested > 10:
                return False
            if any(marker in line for marker in markers):
                return True

    # OK, looks like a hand-written file!
    return False
def isAppropriateFile(filename):
    """Return True if *filename* is a kind of file this script checks.

    That is: a '.adoc', '.c', '.cpp' or '.pod' file, or any file whose name
    with the extension removed ends in 'README'.
    """
    stem, suffix = os.path.splitext(filename)
    if suffix in ('.adoc', '.c', '.cpp', '.pod'):
        return True
    return stem.endswith('README')
def findFilesInFolder(folder):
    """Return the paths of all checkable files under *folder* (recursively).

    A file is included when isAppropriateFile() accepts it and
    isGeneratedFile() does not.  If Ctrl-C is pressed during the walk, the
    files gathered so far are returned, so the result is always a list that
    the caller can iterate.
    """
    files_to_check = []

    for root, subfolders, files in os.walk(folder):
        for f in files:
            if should_exit:
                # Stop early, but still hand back a list rather than None.
                return files_to_check

            f = os.path.join(root, f)
            if isAppropriateFile(f) and not isGeneratedFile(f):
                files_to_check.append(f)

    return files_to_check
def checkFile(filename):
    """Collect the strings of *filename* and spell-check them.

    Any problems found are reported by File.spellCheck().
    """
    findStrings(filename).spellCheck()
#################################################################
# Main logic.

# command-line args.  Controls which files should be checked.
# If no args given, will just scan epan/dissectors folder.
parser = argparse.ArgumentParser(description='Check spellings in specified files')
parser.add_argument('--file', action='store', default='',
                    help='specify individual dissector file to test')
parser.add_argument('--folder', action='store', default='',
                    help='specify folder to test')
parser.add_argument('--commits', action='store',
                    help='last N commits to check')
parser.add_argument('--open', action='store_true',
                    help='check open files')

args = parser.parse_args()


# Get files from wherever command-line args indicate.
files = []
if args.file:
    # Add single specified file..
    if not os.path.isfile(args.file):
        print('Chosen file', args.file, 'does not exist.')
        exit(1)
    else:
        files.append(args.file)
elif args.commits:
    # Get files affected by specified number of commits.
    command = ['git', 'diff', '--name-only', 'HEAD~' + args.commits]
    files = [f.decode('utf-8')
             for f in subprocess.check_output(command).splitlines()]
    # Will examine appropriate files only.  git also lists files that the
    # commits deleted, so skip any path that is no longer present.
    files = [f for f in files if isAppropriateFile(f) and os.path.isfile(f)]
elif args.open:
    # Unstaged changes.
    command = ['git', 'diff', '--name-only']
    files = [f.decode('utf-8')
             for f in subprocess.check_output(command).splitlines()]
    # Only interested in appropriate files that still exist.
    files = [f for f in files if isAppropriateFile(f) and os.path.isfile(f)]
    # Staged changes.
    command = ['git', 'diff', '--staged', '--name-only']
    files_staged = [f.decode('utf-8')
                    for f in subprocess.check_output(command).splitlines()]
    # Only interested in appropriate files that still exist.
    files_staged = [f for f in files_staged if isAppropriateFile(f) and os.path.isfile(f)]
    # Merge the two lists, without checking any file twice.
    for f in files_staged:
        if f not in files:
            files.append(f)
else:
    # By default, scan dissectors
    folder = os.path.join('epan', 'dissectors')
    # But overwrite with any folder entry.
    if args.folder:
        folder = args.folder
        if not os.path.isdir(folder):
            print('Folder', folder, 'not found!')
            exit(1)

    # Find files from folder.
    print('Looking for files in', folder)
    files = findFilesInFolder(folder)


# If scanning a subset of files, list them here.
print('Examining:')
if args.file or args.commits or args.open:
    if files:
        print(' '.join(files), '\n')
    else:
        print('No files to check.\n')
else:
    print('All dissector modules\n')


# Now check the chosen files.
for f in files:
    # Jump out if control-C has been pressed.
    if should_exit:
        exit(1)
    checkFile(f)


# Show the most commonly not-recognised words.  TODO: depend upon a command-line option here?
print('')
for word, count in Counter(missing_words).most_common(100):
    print(word, ':', count)

# Show error count.
print('\n' + bcolors.BOLD + str(len(missing_words)) + ' issues found' + bcolors.ENDC + '\n')

872
tools/wireshark_words.txt Normal file
View File

@ -0,0 +1,872 @@
0x%02x
0x%08x
1xrtt
3gpp2
80211n
accelerometer
accessors
acknowledgement
acp133
actuator
adwin
aes128
aes256
aggregator
agnss
aironet
airpcap
airtel
alcap
alljoyn
alloc
allocators
amperage
analyzers
analyzes
annexc
appdata
appid
arfcn
asn1cnf
asn2wrs
assymetric
async
asynchronously
atheros
atomically
attrib
attrs
authenticates
authenticator
authtoken
authtype
autoconfiguration
autodiscovery
available
avaya
backhaul
backoff
bacnet
bcast
beamformed
beamformee
beamformer
beamforming
bitfield
bitmask
bitrate
bitstring
blackhole
bnode
bootfile
bootloader
bootp
broadcom
bsmap
bssid
bssids
bssmap
btatt
btcommon
bthci
btmesh
btsdp
btsnoop
byte
byteorder
cablelabs
callback
callid
callsign
canceled
canceling
cancelled
cannot
canonicalized
capinfos
capsa
capwap
carrierfreq
carrierid
cccid
ccpch
cctrch
cdma2000
celcius
cellid
cellidentity
chan1
chan2
channelisation
charset
charsets
checkbox
checkout
chocolatey
chunked
ciphered
ciphering
ciphersuite
ciphertext
citrix
classmark
classmark3
cmake
cmdcontrol
codebook
codepoint
codeset
codingrate
coloring
colorise
colorization
colorize
colorized
colorizing
combiner
concatenate
concatenated
concatenates
concurrent
configitem
conformant
connectionless
connid
const
contactless
contiguously
copyfile
couchbase
cpdlc
cpich
cpuregisters
credential
credentials
criticalextensions
criticalextensionsfuture
crnti
crypto
cryptographic
csapi
ctype
customizable
customizing
datagram
datagrams
dataitem
datarate
datastate
datetime
dcerpc
deact
deactivated
deactivating
deactivation
deassertion
deauth
deauthenticated
deauthentication
debian
debug
dechunk
decompressing
decompressor
decremented
decrementing
decrypt
decrypted
decrypting
decryption
defragment
defragmentation
defragmented
defragmenting
dehumidification
delimiters
demultiplexer
demultiplexers
deprecated
deregister
deregistered
deregistering
des40
descr
desegment
desegmentation
desegmenting
deselect
devmode
dfilter
dfsauth
dhcpv
diffie
diplexer
directionality
dissection
dissector
dissectors
distinguisher
diversifier
dlmap
dlsch
dmepi
docsis
doesn't
double
downlink
dpauxmon
dpnss
drbid
dsmcc
dstport
dumpcap
earfcn
ebcdic
ecdhe
ecdsa
editcap
egprs
eigrp
elink
ellipsoid
encap
encaps
encapsulations
enciphered
encrypt
encrypting
endian
endianness
entryid
enumerations
epasv
errorcode
errored
errorportinfo
erspan
etheraddr
ethertype
ettarr
etype
eutra
eutran
extattr
extcap
extensibility
extrainformation
failover
fiber
fileset
firewall
flag1
flag2
flavored
flowid
flowmod
flowspec
format0
fortigate
fortinet
fpiur
framenum
framenumber
framenun
frametype
fsctl
functionalities
funkt
fvalue
ganss
gboolean
gchar
gcrypt
gendc
geoip
geonw
geran
getattr
getnext
gigamon
github
gitlab
gluster
gmprs
goaway
google
gprscdr
groupa
groupb
groupcast
groupmod
guint
handoff
hangup
harqid
hartip
hashed
hazelcast
heuristic
hfarr
HI2Operations
hnbap
homeplug
hopcount
hostname
hsdpa
hsdsch
hspdsch
http2
https
icmpv
ident
idl2wrs
iec60870
ieee17221
ieee80211
iface
ifconfig
ikev2
illuminance
implementor
incits
incrementing
infile
infiniband
infolist
informationitem
informationlist
initialise
initialising
initialization
initialize
initialized
initializer
initializers
initializes
initializing
inline
interleaving
interruptible
interworking
invalidation
ioctl
ipaddr
ipaddress
ipfix
ipprim
ipsec
iptrace
ipv4addr
isobus
iterator
itunes
iwarp
jetds
kademlia
keepalive
kerberos
keylen
keylog
keypress
keyring
keytab
knxip
l2cap
lanalyzer
lcgid
lcids
leasequery
libgcrypt
libpcap
linkaddr
linkinfo
linux
list1
lithionics
logcat
loghans
loglocal
logoff
logout
loopback
lscap
lucent
luminance
macaddr
macaddress
mailto
malloc
mcast
megaco
mellanox
memcache
menubar
mergecap
messageid
metadata
meteorological
microbit
midamble
miniport
minislot
minislots
minus1
mirrorlink
misconfiguration
misconfigured
mode01
mode7
modepage
modespecificinfo
mpeg4
mpsse
mrcpv
msgsend
mtftp
mtrace
multiband
multicarrier
multicast
multicasted
multicore
multiframe
multiframes
multihop
multilateration
multipacket
multipart
multipath
multiplexed
multiplexer
multiplexers
multiplexing
multirat
multirate
multislot
multistate
nacks
namelen
namespace
narrowband
nbrar
netboot
netfilter
netflow
nethop
netlink
netlogon
netmask
netmon
netscaler
nettl
newpw
nexthop
nfs4err
ngsniffer
niagra
nonblock
noncriticalextension
noncriticalextensions
notif
notifier
notused
npcap
nprach
nsapi
nstime
nstrace
objectid
objkey
obsoleted
octets
octetstring
ofdma
offloadability
ofpat
ofppf
ofpxmt
om2000
onduration
onoff
ontime
opcode
openvpn
opnum
optimizations
ospf6
outhdr
packetcable
packetization
packetized
param
parameterization
parameterized
params
parlay
parms
passcode
passkey
passthrough
passwd
pcapng
pcell
pcmax
pcmaxc
pdcch
pdsch
peeraddr
phich
phonebook
physcellid
picmg
pinfo
plaintext
plugin
plugins
pname
polestar
popup
portcounters
portinfo
portmod
portnumber
portstatus
powercontrol
prach
preconfiguration
preconfigured
preempting
preemption
prefs
preloaded
prepay
prepend
preshared
prioritized
privkey
procid
profidrive
profinet
protected
protoabbrev
protobuf
protocolie
pscell
pseudowire
ptvcursor
pubdir
pubkey
pucch
pusch
pytest
qam16
qam64
qnet6
radiotap
ranap
randomizer
randpkt
reachability
readme
realloc
realtime
reassigning
reauth
reauthentication
reauthorize
rebinding
recalculate
recalculating
recognizer
reconf
reconfig
reconfigure
reconfigured
reconfrqst
redelivery
redistributable
redistributables
reencyption
reestablishment
referer
referrer
regex
reimplemented
reinitialize
reinitializing
rekey
rekeying
reoptimization
reordercap
reorigination
representable
reprogrammable
reprogramming
requester
requestor
rerouting
resend
reservable
reserved
reserved0
reserved1
reserved2
reserved3
reserved4
reserved5
resize
resolver
resynchronization
retrans
retransmission
retransmissions
retransmit
retransmits
retransmitted
retries
retry
retyping
rfcomm
rlcmac
rnsap
roamer
routable
rpcap
rtpmidi
sanitize
satisfiable
scaler
scannable
scell
scoped
scrollbar
segno
semiautomatic
seqno
seqnum
sequenceno
serialize
serialized
sessionid
setattr
setuid
severities
sflow
sha256
sha384
sha512
sharkd
shouldn't
siapp
sidelink
signaling
slsch
sname
snaplen
snow3g
someip
spare
spare1
spare2
spare3
spare4
spare5
spare6
spare7
spare8
spare9
spcell
spnego
spooled
srbid
srcport
ssupervisor
stateful
statusbar
streamid
struct
subaddress
subband
subcarrier
subcarriers
subchannel
subcode
subdevice
subdissector
subdissectors
subelem
subelement
subelements
subframes
subheader
subheaders
subids
subindex
subm
submode
subnet
subnets
subobj
subobject
suboption
suboptions
subparam
subpdu
subpm
subquery
subselect
subselection
subslot
subtlv
subtree
subtrees
switchinfo
synchronizing
synphasor
sysdig
sysex
sysframe
syslog
systemd
tablemod
tcpip
tcpudp
tdd128
tdd384
tdd768
teredo
text2pcap
timeout
timeslot
timestamp
timestamps
timezone
toggling
toolongfragment
tooltip
touchlink
traceroute
transcoder
truncate
tshark
tspec
tunneled
tunneling
tvbuff
type1
type2
type3
typedef
uarfcn
uboot
ubuntu
udpcp
uint16
uint32
uint8
ulmap
ulsch
unaligned
unassign
unauthenticated
uncalculated
unciphered
uncompress
uncompressing
uncompression
unconfigurable
unconfigured
unconfirm
uncorrectable
undecipherable
undecodable
undecoded
undecryptable
undecrypted
undeliverable
underflow
underrun
undissected
unencrypted
unescaped
unfragmented
unhandled
unicast
unicode
unignore
unimplemented
uninitialized
uninstall
uninstaller
unknown1
unlink
unmarshal
unparsable
unparsed
unpunctuated
unreassembled
unrecoverable
unrecovered
unregister
unregistration
unreportable
unresponded
unroutable
unsecure
unsegmented
unsequenced
unsubscribe
unsynchronized
untagged
untruncated
untrusted
untunelled
uplink
upload
uploaded
uploading
urlencoded
urnti
userdata
userinfo
userlist
userplane
utilization
utran
v1250
v1310
v1410
v1530
v1610
verizon
version2
version3
version4
version5
version6
version7
versioning
virtualization
volerr
wakeup
webcam
websocket
wideband
wikipedia
wimax
winpcap
winspool
wireshark
wiretap
withfcs
withoutfcs
wksta
writable
wslua
wsluarm
x509sat
xchannel
xmlns
z3950
zigbee