// op25/op25/gr-op25_repeater/lib/ezpwd/corrector
//
// 507 lines
// 19 KiB
// Plaintext

/*
* Ezpwd Reed-Solomon -- Reed-Solomon encoder / decoder library
*
* Copyright (c) 2014, Hard Consulting Corporation.
*
* Ezpwd Reed-Solomon is free software: you can redistribute it and/or modify it under the terms of
* the GNU General Public License as published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version. See the LICENSE file at the top of the
* source tree. Ezpwd Reed-Solomon is also available under Commercial license. c++/ezpwd/rs_base
* is redistributed under the terms of the LGPL, regardless of the overall licensing terms.
*
* Ezpwd Reed-Solomon is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
* the GNU General Public License for more details.
*/
#ifndef _EZPWD_CORRECTOR
#define _EZPWD_CORRECTOR
#include "rs"
#include "serialize"
namespace ezpwd {
//
// best_avg -- collect <password>,<confidence> guesses, and return the unambiguous best one
//
typedef std::map<std::string, std::pair<int, int>> // (<password>, (<count>, <avgsum>))
                                best_avg_base_t;
//
// best_avg -- accumulates (<count>,<avgsum>) per password string and ranks them by average
//
//     The underlying std::map is a public base, so entries can also be created directly
// (eg. via operator[] or insert); such an entry has a <count> of 0 until add() is used.
//
class best_avg
    : public best_avg_base_t
{
public:
    using best_avg_base_t::begin;
    using best_avg_base_t::end;
    using best_avg_base_t::insert;
    using best_avg_base_t::find;
    using best_avg_base_t::iterator;
    using best_avg_base_t::const_iterator;
    using best_avg_base_t::value_type;
    using best_avg_base_t::mapped_type;

    //
    // add -- add the given pct to the current average for <string> str
    //
    //     Creates the entry (with zero count and sum) if it does not exist yet, then counts
    // one more sample of 'pct'.  Returns an iterator to the entry.
    //
    iterator                    add(
                                    const std::string  &str,
                                    int                 pct )
    {
        iterator                i       = find( str );
        if ( i == end() )
            i                           = insert( i, value_type( str, mapped_type() ));
        i->second.first                += 1;
        i->second.second               += pct;
        return i;
    }

    //
    // best -- return the unambiguously best value (average is >, or == but longer), or end()
    //
    //     If the highest average is shared by two or more strings of the same (longest)
    // length, the result is ambiguous and end() is returned.  An empty map also yields end().
    //
    const_iterator              best()
        const
    {
        const_iterator          top     = end();
        bool                    uni     = false;
        for ( const_iterator i = begin(); i != end(); ++i ) {
            if ( top == end() ) {
                top                     = i;    // first candidate is trivially the best so far
                uni                     = true;
                continue;
            }
            const int           avg_i   = average( i->second );
            const int           avg_top = average( top->second );
            if ( avg_i > avg_top
                 or ( avg_i == avg_top and i->first.size() > top->first.size())) {
                top                     = i;
                uni                     = true;
            } else if ( avg_i == avg_top
                        and i->first.size() == top->first.size()) {
                uni                     = false; // same average and length: ambiguous
            }
        }
        return uni ? top : end();
    }

    //
    // evaluation -- process a (<password>,(<count>,<avgsum>)) into (<average>,<password>)
    // sort       -- return a multimap indexed by <average> --> <string>
    // output     -- output the <string>: <average>, sorted by average
    //
    //     The <password> in an evaluation (and in every sort() element) is a reference to the
    // key of the supplied entry; it is only valid while that entry remains alive.
    //
    static std::pair<const int,const std::string &>
                                evaluation( const value_type &val )
    {
        return std::pair<const int,const std::string &>( average( val.second ), val.first );
    }
    typedef std::multimap<const int,const std::string &>
                                sorted_t;
    sorted_t                    sort()
        const
    {
        sorted_t                dst;
        std::transform( begin(), end(), std::inserter( dst, dst.begin() ), evaluation );
        return dst;
    }
    std::ostream               &output(
                                    std::ostream       &lhs )
        const
    {
        for ( const auto &i : sort() )
            lhs << std::setw( 16 ) << i.second
                << ": " << std::setw( 3 ) << i.first
                << std::endl;
        return lhs;
    }

private:
    //
    // average -- integer average <avgsum>/<count> of one entry; 0 if it has no samples
    //
    //     Guards the division so that an entry with a zero <count> cannot trigger an integer
    // divide-by-zero.
    //
    static int                  average( const mapped_type &tally )
    {
        return tally.first != 0 ? tally.second / tally.first : 0;
    }
};
} // namespace ezpwd
//
// operator<< -- stream an ezpwd::best_avg via its output() method
//
//     Marked inline: this is a non-template function defined in a header, so without inline
// every translation unit including the header would emit its own definition and the program
// would fail to link (One Definition Rule).
//
inline
std::ostream                   &operator<<(
                                    std::ostream       &lhs,
                                    const ezpwd::best_avg
                                                       &rhs )
{
    return rhs.output( lhs );
}
namespace ezpwd {
//
// ezpwd::corrector -- Apply statistical corrections to a string, returning the confidence
//
// All methods are static; no instance is required, as this is primarily used to create
// external language APIs.
//
template <
size_t PARITY,
size_t N = 64,
typename SERIAL = serialize::base< N, serialize::ezpwd< N >>>
class corrector {
public:
//
// output -- describe this corrector's template parameters on the supplied stream
//
static
std::ostream                   &output(
                                    std::ostream       &lhs )
{
    return lhs << "corrector<PARITY=" << PARITY << ",N=" << N << ",SERIAL=" << SERIAL() << ">";
}
//
// parity(<string>) -- compute the R-S parity for the supplied password, serialized by SERIAL
//
//     The raw parity produced by rscodec is passed through SERIAL::encode before it is
// returned; the password itself is not modified.
//
static std::string              parity(
                                    const std::string  &password )
{
    std::string                 symbols;
    rscodec.encode( password, symbols );        // raw R-S parity symbols
    SERIAL::encode( symbols );                  // --> printable form
    return symbols;
}
//
// encode(<string>) -- append the parity symbols computed by parity() to the password
//
//     For the <char*> form, the supplied password buffer size must be sufficient to contain
// PARITY additional symbols, plus the terminating NUL.  Both forms return the resultant
// encoded password size (excluding the NUL).
//
static size_t                   encode(
                                    std::string        &password )
{
    const std::string           suffix  = parity( password );
    password.append( suffix );
    return password.size();
}
//
// encode(<char*>,<size_t>) -- append PARITY parity symbols to a NUL-terminated C string, in place
//
//     password -- buffer holding the NUL-terminated password; receives the appended parity
//     size     -- total capacity of the buffer, in bytes
//
//     Returns the length of the encoded password (excluding the terminating NUL).  Throws
// std::runtime_error if the buffer cannot hold the password, PARITY parity symbols and the
// NUL, or if the computed parity is not exactly PARITY symbols.  The search for the
// terminating NUL never reads beyond 'size' bytes; an unterminated buffer is reported as
// having insufficient capacity.
//
static size_t                   encode(
                                    char               *password,
                                    size_t              size )  // maximum available size
{
    char * const                limit   = password + size;
    char * const                nul     = std::find( password, limit, '\0' );
    size_t                      len     = nul - password; // length w/o terminating NUL
    // If no NUL was found then len == size, and the capacity check below fails as required.
    if ( len + PARITY + 1 > size )
        throw std::runtime_error( "ezpwd::rspwd::encode password buffer has insufficient capacity" );
    std::string                 par     = parity( std::string( password, password + len ));
    if ( par.size() != PARITY )
        throw std::runtime_error( "ezpwd::rspwd::encode computed parity with incorrect size" );
    std::copy( par.begin(), par.end(), password + len );
    len                                += PARITY;
    password[len]                       = 0;
    return len;
}
//
// decode(<string>[,...]) -- Applies R-S error correction on the encoded string, removing parity
//
// Up to 'PARITY' Reed-Solomon parity symbols are examined, to determine if the supplied
// string is a valid R-S codeword and hence very likely to be correct. Optionally supply a
// vector of erasure positions.
//
// An optional 'minimum' final password length may be provided; no R-S parity is assumed
// to exist in the first 'minimum' password characters (default: PARITY). This prevents
// accidentally finding valid R-S codewords in passwords of known minimum length; validation
// codes, for example. Likewise, the optional 'maximum' allows us to limit the number of
// parity symbols that may be assumed to be missing from the end of the codeword.
//
// Returns a confidence strength rating, which is the ratio:
//
// 100 - ( errors * 2 + erasures ) * 100 / parity
//
// if an R-S codeword was solved, and 0 otherwise. If a codeword is solved, but the number
// of errors and erasures corrected indicates that all parity was consumed, the caller may
// opt to not use the corrected string, because there is a chance that our R-S polynomial
// was overwhelmed with errors and actually returned an incorrect codeword. Therefore,
// solving a codeword using all available parity results in 100 - PARITY * 100 / PARITY ==
// 0, which indicates that there is no certainty of correctness; all R-S parity resources
// were used in error/erasure recovery, with none left to confirm that the result is actually
// correct. If only zero-strength results are achieved, the longest will be returned (the
// full, original string).
//
// Supports the following forms of error/erasure:
//
// 0) Full parity. All data and parity supplied, and an R-S codeword is solved.
//
// 1) Partial parity. All data and some parity supplied; remainder are deemed erasures.
//
// If PARITY > 2, then up to (PARITY+1)/2-1 trailing parity terms are marked as erasures.
// If the R-S codeword is solved and a safe number of errors are found, then we can have
// reasonable confidence that the string is correct.
//
// 1a) Erase errors. Permute the combinations of up to PARITY-1 erasures.
//
// o) Raw password. No parity terms supplied; not an R-S codeword
//
// If none of the error/erasure forms succeed, the password is returned unmodified.
//
// If a non-zero 'minimum' or 'maximum' are provided, they constrain the possible
// resultant password sizes that will be attempted.
//
static
int decode(
std::string &password,
const std::vector<int>
&erasures,
size_t minimum = PARITY,//always deemed at least 1
size_t maximum = 0 ) // if 0, no limit
{
int confidence;
best_avg best;
// Full/Partial parity. Apply some parity erasure if we have some erasure/correction
// capability while maintaining at least one excess parity symbol for verification.
// This can potentially result in longer password being returned, if the R-S decoder
// accidentally solves a codeword.
//
// For example, if PARITY=3 (or 4) then (PARITY+1)/2 == 2, and we would only attempt up
// to 1 parity erasure. This would leave 1 parity symbol to replace the 1 erasure, and
// 1 remaining to validate the integrity of the password.
//
// The password must be long enough to contain at least 1 non-parity symbol, and the
// designated number of non-erased parity symbols! However, by convention we'll demand
// that the password contain at least PARITY symbols -- any less, and we can
// accidentally correct the few remaining password symbols.
//
// Also, if any parity symbols won't decode (eg. were entered in error), we must deem
// them to be erasures, too, and if the number of erasures exceeds the capacity of the
// R-S codec, it'll fail (throw an exception, or at best solve with 0 confidence).
for ( size_t era = 0 // how many parity symbols to deem erased
; era < (PARITY+1)/2
; ++era ) {
if ( password.size() < ( minimum ? minimum : 1 ) + PARITY - era ) {
#if defined( DEBUG ) && DEBUG >= 1
output( std::cout )
<< " Rejected too short password \""
<< password << std::string( era, '_' )
<< "\"" << " (" << era << " parity skipped)"
<< std::endl;
#endif
continue; // too few password symbols to start checking parity
}
if ( maximum and password.size() > maximum + PARITY - era ) {
#if defined( DEBUG ) && DEBUG >= 1
output( std::cout )
<< " Rejected too long password \""
<< password << std::string( era, '_' )
<< "\"" << " (" << era << " parity skipped)"
<< std::endl;
#endif
continue; // too few parity symbols erased to start checking parity
}
// Copy password, adding 'era' additional NULs
std::string fixed( password.size() + era, 0 );
std::copy( password.begin(), password.end(), fixed.begin() );
// Decode the base-N parity, denoting any invalid (mistyped or trailing NUL) symbols
// as erasures (adjust erasure offsets to be from start of password, not start of
// parity). All newly added 'era' symbols will be NUL, and will be invalid. After
// decoding parity, if we've slipped below our minimum R-S capacity threshold
// (ie. because of mistyped parity symbols), don't attempt.
std::vector<int> all_era;
SERIAL::decode( fixed.begin() + fixed.size() - PARITY,
fixed.begin() + fixed.size(), &all_era, 0,
serialize::ws_invalid, serialize::pd_invalid );
if ( all_era.size() >= (PARITY+1)/2 ) {
#if defined( DEBUG ) && DEBUG >= 1
output( std::cout )
<< " Rejected low parity password \""
<< password << std::string( era, '_' )
<< "\"" << " (" << all_era.size() << " parity erasures + "
<< era << " skipped)"
<< std::endl;
#endif
continue; // Too many missing parity symbols
}
if ( all_era.size() + erasures.size() > PARITY ) {
#if defined( DEBUG ) && DEBUG >= 1
output( std::cout )
<< " Rejected hi erasure password \""
<< password << std::string( era, '_' )
<< "\"" << " (" << all_era.size() + erasures.size() << " total erasures + "
<< era << " skipped)"
<< std::endl;
#endif
continue; // Total erasures beyond capacity
}
for ( auto &o : all_era )
o += fixed.size() - PARITY;
std::copy( erasures.begin(), erasures.end(), std::back_inserter( all_era ));
// Enough parity to try to decode. A successful R-S decode with 0 (remaining)
// confidence indicates a successfully validated R-S codeword! Use it (ex. parity).
try {
std::vector<int> position;
int corrects= rscodec.decode( fixed, all_era, &position );
confidence = strength<PARITY>( corrects, all_era, position );
fixed.resize( fixed.size() - PARITY );
if ( confidence >= 0 )
best.add( fixed, confidence );
#if defined( DEBUG ) && DEBUG >= 1
output( std::cout )
<< " Reed-Solomon w/ " << era << " of " << PARITY
<< " parity erasures " << std::setw( 3 ) << confidence
<< "% confidence: \"" << password
<< "\" ==> \"" << fixed
<< "\" (corrects: " << corrects
<< ", erasures at " << all_era
<< ", fixed at " << position << "): "
<< std::endl
<< best;
#endif
} catch ( std::exception &exc ) {
#if defined( DEBUG ) && DEBUG >= 2 // should see only when ezpwd::reed_solomon<...>::decode fails
output( std::cout ) << " invalid part parity password: " << exc.what() << std::endl;
#endif
}
}
// Partial parity, but below threshold for usable error detection. For the first 1 to
// (PARITY+1)/2 parity symbols (eg. for PARITY == 3, (PARITY+1)/2 == 1 ), we cannot
// perform meaningful error or erasure detection. However, if we see that the terminal
// symbols match the R-S symbols we expect from a correct password, we'll ascribe a
// partial confidence due to the matching parity symbols.
//
// password: sock1t
// w/ 3 parity: sock1tkeB
// password ----^^^^^^
// ^^^--- parity
//
for ( size_t era = (PARITY+1)/2 // how many parity symbols are not present
; era < PARITY
; ++era ) {
if ( password.size() < ( minimum ? minimum : 1 ) + PARITY - era ) {
#if defined( DEBUG ) && DEBUG >= 1
output( std::cout )
<< " Rejected too short password \""
<< password << std::string( era, '_' )
<< "\""
<< std::endl;
#endif
continue; // too few password symbols to start checking parity
}
if ( maximum and password.size() > maximum + PARITY - era ) {
#if defined( DEBUG ) && DEBUG >= 1
output( std::cout )
<< " Rejected too long password \""
<< password << std::string( era, '_' )
<< "\"" << " (" << era << " parity skipped)"
<< std::endl;
#endif
continue; // too few parity symbols erased to start checking parity
}
std::string fixed = password;
size_t len = password.size() - ( PARITY - era );
fixed.resize( len );
encode( fixed );
auto differs = std::mismatch( fixed.begin(), fixed.end(), password.begin() );
size_t par_equ = differs.second - password.begin();
if ( par_equ < len || par_equ > len + PARITY )
throw std::runtime_error( "miscomputed R-S parity matching length" );
par_equ -= len;
// At least one parity symbol is requires to give any confidence
if ( par_equ > 0 ) {
std::string basic( fixed.begin(), fixed.begin() + len );
confidence = par_equ * 100 / PARITY; // each worth a normal parity symbol
best.add( basic, confidence );
#if defined( DEBUG ) && DEBUG >= 1
output( std::cout )
<< " Check Chars. w/ " << era << " of " << PARITY
<< " parity missing " << std::setw( 3 ) << confidence
<< "% confidence: \"" << password
<< "\" ==> \"" << basic
<< " (from computed: \"" << fixed << "\")"
<< ": "
<< std::endl
<< best;
#endif
}
}
// Select the best guess and return its confidence. Otherwise, use raw password? If no
// error/erasure attempts succeeded (if no 'best' w/ confidence >= 0), then we'll use
// the raw password w/ 0 confidence, if it meets the minimum/maximum length
// requirements.
confidence = -1;
if ( password.size() >= ( minimum ? minimum : 1 )
and ( maximum == 0 or password.size() <= maximum ))
confidence = 0;
typename best_avg::const_iterator
bi = best.best();
#if defined( DEBUG )
output( std::cout )
<< " Selected " << ( bi != best.end() ? "corrected" : "unmodified" )
<< " password \"" << ( bi != best.end() ? bi->first : password )
<< "\" of length " << ( minimum ? minimum : 1) << "-" << maximum
<< " (vs. \"" << password
<< "\") w/ confidence " << (bi != best.end() ? bi->second.second : confidence )
<< "%, from: "
<< std::endl
<< best;
#endif
if ( bi != best.end() ) {
auto better = best.evaluation( *bi ); // --> (<average>,<password>)
password = better.second;
confidence = better.first;
}
return confidence;
}
static
int decode(
std::string &password,
size_t minimum = PARITY,
size_t maximum = 0 )
{
return decode( password, std::vector<int>(), minimum, maximum );
}
//
// decode(<char*>,<size_t>,<size_t>,<size_t>) -- C interface to decode(<string>)
//
//     Traditional C interface.  The provided NUL-terminated password+parity is decoded
// (parity removed) in place, and the confidence % is returned.
//
//     password -- NUL-terminated input; receives the decoded password, or an error message
//     siz      -- available size of the 'password' buffer
//     minimum  -- minimum resultant password length
//     maximum  -- maximum resultant password length (0 means no limit)
//
//     If any failure occurs (including failure to copy the input, or a result too large for
// the buffer), a -'ve value will be returned, and the supplied password buffer will be used
// to contain an error description.
//
static int                      decode(
                                    char               *password, // NUL terminated
                                    size_t              siz,      // available size
                                    size_t              minimum = PARITY,//minimum resultant password length
                                    size_t              maximum = 0 )    // maximum ''
{
    int                         confidence = -1;
    try {
        // Constructed inside the try, so that an allocation failure is also reported via
        // the return value instead of escaping to the (possibly C) caller.
        std::string             corrected( password );
        confidence                      = decode( corrected, minimum, maximum );
        if ( corrected.size() + 1 > siz )
            throw std::runtime_error( "password buffer has insufficient capacity" );
        std::copy( corrected.begin(), corrected.end(), password );
        password[corrected.size()]      = 0;
    } catch ( const std::exception &exc ) {
        confidence                      = -1;
        ezpwd::streambuf_to_buffer sbf( password, siz );
        std::ostream( &sbf ) << "corrector<" << PARITY << "> failed: " << exc.what();
    }
    return confidence;
}
//
// rscodec -- The RS(N-1,N-1-PARITY) Reed-Solomon codec shared by all static methods
//
//     Declaration only; the definition of this static member follows the class.
//
//     Encodes and decodes R-S symbols over the low-order bits of the supplied data.
// NOTE(review): the original text said "lower 6 bits" and a parity range of [0,63], which
// matches the default N == 64 -- confirm the intended wording for other values of N.
// Presumably it is the last PARITY (not N) symbols of the data that must lie in that range;
// verify against ezpwd::RS.  The excess bits on the data symbols are stated to be masked and
// restored during decoding.
//
static const ezpwd::RS<N-1,N-1-PARITY>
rscodec;
};
//
// Out-of-class definition of the static 'rscodec' member declared in corrector, provided
// for every corrector<PARITY,N,SERIAL> instantiation.
//
template < size_t PARITY, size_t N, typename SERIAL >
const ezpwd::RS<N-1,N-1-PARITY>
corrector<PARITY,N,SERIAL>::rscodec;
} // namespace ezpwd
//
// operator<< -- stream the description of an ezpwd::corrector<...> via its static output()
//
//     Only the type of the right-hand operand matters; output() is static, so the instance
// itself is never inspected.
//
template < size_t PARITY, size_t N, typename SERIAL >
std::ostream                   &operator<<(
                                    std::ostream       &lhs,
                                    const ezpwd::corrector<PARITY,N,SERIAL>
                                                       & /* rhs */ )
{
    typedef ezpwd::corrector<PARITY,N,SERIAL>
                                corrector_t;
    return corrector_t::output( lhs );
}
#endif // _EZPWD_CORRECTOR