#!/usr/bin/env python3
# Wireshark - Network traffic analyzer
# By Gerald Combs <gerald@wireshark.org>
# Copyright 1998 Gerald Combs
#
# SPDX-License-Identifier: GPL-2.0-or-later

import os
import re
import shutil

import requests

# This utility scans the dissector code for URLs, then attempts to
# fetch the links. The results are written to stdout and, at the end
# of the run, also to files:
# - URLs that couldn't be loaded are written to failures.txt
# - working URLs are written to successes.txt
# - any previous failures.txt is also copied to failures_last_run.txt
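#
# Typical invocation (the script name and tools/ location are assumptions;
# the code below only requires that the script live one directory below the
# source root, next to epan/):
#   python3 tools/check_dissector_urls.py
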
# TODO:
# - allow single dissector name to be given as a command-line arg.
# - option to write back to dissector file when there is a failure?
# - make requests in parallel (run takes around 35 minutes)? See the sketch
#   in the comment below.
# - optionally parse previous successes.txt and avoid fetching them again?
# - make sure URLs are really within comments in code?
# - use urllib.parse or similar to better check URLs?
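#
# One possible shape for the parallel-requests TODO (an untested sketch, not
# wired in here); the work is I/O-bound, so a thread pool should be enough:
#
#   from concurrent.futures import ThreadPoolExecutor
#   with ThreadPoolExecutor(max_workers=8) as pool:
#       list(pool.map(lambda link: link.validate(session), links))
#
# The shared cached_lookups dict (and possibly the shared session) would need
# locking, or per-thread sessions, before doing this for real.
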

class FailedLookup:

    def __init__(self):
        # Fake values that will be queried (for a requests.get() return value)
        self.status_code = 0
        self.headers = {}
        self.headers['content-type'] = '<NONE>'

    def __str__(self):
        s = ('FailedLookup: status_code=' + str(self.status_code) +
             ' content-type=' + self.headers['content-type'])
        return s

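# N.B. FailedLookup only fakes the attributes that the reporting code reads
# (status_code and the content-type header). Link.validate() returns early
# when it sees the 0 status code, so .content is never read from one.
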
# Dictionary from url -> result
cached_lookups = {}
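# (The same URL often appears in several dissector files; caching means each
# is fetched only once. Values are requests.Response objects, or FailedLookup
# stand-ins when the request itself failed.)
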
# These are strings typically seen after redirecting to a page that won't have
# what we are looking for. Usually get a 404 for these anyway.
# TODO: likely more of these...
apology_strings = ["sorry, we cannot find the page",
                   "this page could not be found",
                   "the page you're looking for can't be found",
                   "the content you are looking for cannot be found...",
                   "the resource you are looking for has been removed"]


class Link(object):

    def __init__(self, file, line_number, url):
        self.file = file
        self.line_number = line_number
        self.url = url
        self.tested = False
        self.r = None
        self.success = False
        self.result_from_cache = False

    def __str__(self):
        s = (('SUCCESS ' if self.success else 'FAILED ') + self.file + ':' + str(self.line_number) +
             ' ' + self.url + " status-code=" + str(self.r.status_code) +
             ' content-type="' + (self.r.headers['content-type'] if ('content-type' in self.r.headers) else 'NONE') + '"')
        return s
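
    # An illustrative example of the line format produced by __str__ above
    # (file name, line number and URL are made up):
    #   FAILED packet-foo.c:123 http://www.example.com/spec status-code=404 content-type="text/html"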

    def looksLikeApology(self):
        # Lower-case the body so the lower-case needles in apology_strings
        # match regardless of how the page capitalises its message.
        content = str(self.r.content).lower()
        # N.B. invariably comes back as just one line...
        if any(needle in content for needle in apology_strings):
            print('Found apology!')
            return True
        return False

    def validate(self, session):
        # Fetch, but first look in cache
        global cached_lookups
        self.tested = True
        if self.url in cached_lookups:
            print('[Using cached result for', self.url, ']')
            self.r = cached_lookups[self.url]
            self.result_from_cache = True
        else:
            try:
                # Try it.
                self.r = session.get(self.url, timeout=15)

                # Cache this result.
                cached_lookups[self.url] = self.r
            except Exception:
                # Anything can go wrong here (bad URL, DNS failure, SSL error,
                # connection reset, ...), so catch broadly.
                print(self.url, ': failed to make request')
                self.success = False
                # Add bad result to cached_lookups.
                cached_lookups[self.url] = FailedLookup()
                self.r = cached_lookups[self.url]
                return

        # Check return value
        if self.r.status_code < 200 or self.r.status_code >= 300:
            self.success = False
            return

        # Look for 'not really found' type strings in r.content
        if self.looksLikeApology():
            print('Got body, but it looks like content has moved?')
            self.success = False
            return

        # Assume it's OK.
        self.success = True


# Scan the given folder for links to test.
def findLinks(folder):
    links = []

    # Look at files in sorted order, to give some idea of how far through it
    # is.
    for filename in sorted(os.listdir(folder)):
        if filename.endswith('.c'):
            with open(os.path.join(folder, filename), 'r') as f:
                for line_number, line in enumerate(f, start=1):
                    # Match http/https URLs built from common URL characters
                    # and %-escapes (a deliberately rough pattern).
                    urls = re.findall(
                        r'https?://(?:[a-zA-Z0-9./_?&=-]+|%[0-9a-fA-F]{2})+', line)

                    for url in urls:
                        # Lop off any trailing chars that are not part of it
                        url = url.rstrip(").',")

                        # A url must have a period somewhere
                        if '.' not in url:
                            continue

                        print('Found URL:', url)
                        links.append(Link(filename, line_number, url))
    print('Found', len(links), 'links')
    return links


#################################################################
# Main logic.

# Find links from dissector folder.
links = findLinks(os.path.join(os.path.dirname(__file__), '..', 'epan', 'dissectors'))

# Prepare one session for all requests (it re-uses connections to the same
# host across requests). For args, see
# https://requests.readthedocs.io/en/master/
session = requests.Session()
# N.B. There is no session-wide timeout; a per-request timeout is passed to
# session.get() in Link.validate() instead.
# Default headers don't always get responses where proper browsers do.
session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36',
                        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'})

# Try out the links.
limit = 5000  # Control for debug
for checks, link in enumerate(links):
    link.validate(session)
    print(link)
    if checks > limit:
        break

# Write failures to a file. Back up any previous first though.
if os.path.exists('failures.txt'):
    shutil.copyfile('failures.txt', 'failures_last_run.txt')
with open('failures.txt', 'w') as f_f:
    for l in links:
        if l.tested and not l.success:
            f_f.write(str(l) + '\n')
# And successes
with open('successes.txt', 'w') as f_s:
    for l in links:
        if l.tested and l.success:
            f_s.write(str(l) + '\n')

# Show overall stats.
passed, failed, cached = 0, 0, 0
for l in links:
    if l.tested and not l.result_from_cache:
        if l.success:
            passed += 1
        else:
            failed += 1
    if l.result_from_cache:
        cached += 1
print('--------------------------------------------------------------------------------------------------')
print(len(links), 'links checked:', passed, 'passed,',
      failed, 'failed,', cached, 'results from cache')