isdn4k-utils/areacode/areacode.c

760 lines
23 KiB
C

/*****************************************************************************/
/* */
/* AREACODE.C */
/* */
/* Portable library module to search for an area code in a database. */
/* */
/* */
/* */
/* (C) 1996-98 Ullrich von Bassewitz */
/* Wacholderweg 14 */
/* D-70597 Stuttgart */
/* EMail: uz@musoftware.de */
/* */
/* */
/* This software is provided 'as-is', without any expressed or implied */
/* warranty. In no event will the authors be held liable for any damages */
/* arising from the use of this software. */
/* */
/* Permission is granted to anyone to use this software for any purpose, */
/* including commercial applications, and to alter it and redistribute it */
/* freely, subject to the following restrictions: */
/* */
/* 1. The origin of this software must not be misrepresented; you must not */
/* claim that you wrote the original software. If you use this software */
/* in a product, an acknowledgment in the product documentation would be */
/* appreciated but is not required. */
/* 2. Altered source versions must be plainly marked as such, and must not */
/* be misrepresented as being the original software. */
/* 3. This notice may not be removed or altered from any source */
/* distribution. */
/* */
/*****************************************************************************/
/*
* The code assumes
* - 8 bit bytes
* - unsigned long is 32 bit. This may be changed by #defining u32 to
* a data type that is an 32 bit unsigned when compiling this module.
* - ascii character set
*
* The code does *not* assume
* - a specific byte order. Currently the code autoadjusts to big or
* little endian data. If you have something more weird than that,
* you have to add conversion code.
*
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <limits.h>
#include <assert.h>
#include <ctype.h>
#include "areacode.h"
/*****************************************************************************/
/* Externally visible data */
/*****************************************************************************/
/* The name of the areacode data file. The default is what is #defined as
* DATA_FILENAME. If this is not #defined, the default is "areacode.dat",
* which is probably not what you want. In the latter case set this to
* the correct filename *before* your first call to GetAreaCodeInfo.
*/
#ifdef DATA_FILENAME
char* acFileName = DATA_FILENAME;
#else
char* acFileName = "areacode.dat";
#endif
/* How much dynamic memory is GetAreaCodeInfo allowed to consume? Having less
* memory means more disk access and vice versa. The function does even work
* if you set this value to zero. For maximum performance, the function needs
* 4 byte per area code stored in the data file. The default is 32KB.
*/
unsigned long acMaxMem = 0x8000UL;
/*****************************************************************************/
/* Data and structures */
/*****************************************************************************/
/* Define an unsigned quantity with 32 bits. Try to make some clever
* assumptions using the data from limits.h. This may break some older
* (non ISO compliant) compilers, but I can't help...
*/
#if !defined(u32) && defined(ULONG_MAX)
# if ULONG_MAX == 4294967295UL
# define u32 unsigned long
# endif
#endif
#if !defined(u32) && defined(UINT_MAX)
# if UINT_MAX == 4294967295UL
# define u32 unsigned
# endif
#endif
#if !defined(u32) && defined(USHRT_MAX)
# if USHRT_MAX == 4294967295UL
# define u32 unsigned short
# endif
#endif
#if !defined(u32)
# define u32 unsigned long
#endif
/* The version of the data file we support (major only, minor is ignored) */
#define acVersion 0x200
/* The magic words in little and big endian format */
#define LittleMagic 0x35465768L
#define BigMagic 0x68574635L
/* Defining the byte ordering */
#define boLittleEndian 0
#define boBigEndian 1
/* The byte order used in the file is little endian (intel) format */
#define FileByteOrder boLittleEndian
/* Shortcuts */
#define scCount 30
/* This is the header data of the data file. It is not used anywhere in
* the code, just have a look at it since it describes the layout in the
* file.
*/
typedef struct {
u32 Magic;
u32 Version; /* Version in hi word, build in lo word */
u32 Count;
u32 AreaCodeStart;
u32 InfoIndexStart;
u32 InfoStart;
u32 MinLength;
u32 CodeDataSize;
u32 Shortcut [scCount]; /* Shortcuts for compression */
} PrefixHeader;
/* This is what's really used: */
typedef struct {
/* The file we read from */
FILE* F;
/* Machine byte order */
unsigned ByteOrder;
/* Stuff from the file header */
unsigned Version;
unsigned Build;
u32 Count;
u32 AreaCodeStart;
u32 InfoIndexStart;
u32 InfoStart;
u32 AreaCodeLenStart;
u32 MinLength; /* Minimum phone number length in data file */
u32 CodeDataSize;
u32 Shortcut [scCount]; /* Compression shortcuts */
/* Control data */
long First;
long Last;
u32* Table;
} AreaCodeDesc;
/* Translation table for translation ISO-8859-1 --> CP850. To save some space,
* the table covers only values > 127
*/
#ifdef CHARSET_CP850
static char CP850Map [128] = {
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0xAD, 0x9B, 0x9C, 0x20, 0x9D, 0x20, 0x20,
0x20, 0x20, 0xA6, 0xAE, 0xAA, 0x20, 0x20, 0x20,
0xF8, 0xF1, 0xFD, 0x20, 0x20, 0xE6, 0x20, 0xF9,
0x20, 0x20, 0xA7, 0xAF, 0xAC, 0x20, 0x20, 0xA8,
0x20, 0x20, 0x20, 0x20, 0x8E, 0x8F, 0x92, 0x80,
0x20, 0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0xA5, 0x20, 0x20, 0x20, 0x20, 0x99, 0x20,
0x20, 0x20, 0x20, 0x20, 0x9A, 0x20, 0x20, 0xE1,
0x85, 0xA0, 0x83, 0x20, 0x84, 0x86, 0x91, 0x87,
0x8A, 0x82, 0x88, 0x89, 0x8D, 0xA1, 0x8C, 0x8B,
0x20, 0xA4, 0x95, 0xA2, 0x93, 0x20, 0x94, 0xF6,
0x20, 0x97, 0xA3, 0x20, 0x81, 0x20, 0xB0, 0x98
};
#endif
/* Macro to convert from big endian to little endian format and vice versa.
* Beware: The macro evaluates its parameter more than once!
*/
#define _ByteSwap(__V) ((((__V) & 0x000000FF) << 24) | \
(((__V) & 0xFF000000) >> 24) | \
(((__V) & 0x0000FF00) << 8) | \
(((__V) & 0x00FF0000) >> 8))
/*****************************************************************************/
/* Helper functions */
/*****************************************************************************/
static int IsShortcut (char C)
/* Return true if the given character is a shortcut */
{
return (C >= 0x01 && C <= 0x1F);
}
static u32 _ByteSwapIfNeeded (u32 D, unsigned ByteOrder)
/* Put the bytes into the correct order according to ByteOrder */
{
/* Swap bytes if needed and return the result */
switch (ByteOrder) {
case boLittleEndian: return D;
default: return _ByteSwap (D);
}
}
static u32 ByteSwapIfNeeded (u32 D, const AreaCodeDesc* Desc)
/* Put the bytes into the correct order according to ByteOrder in Desc */
{
/* Swap bytes if needed and return the result */
return _ByteSwapIfNeeded (D, Desc->ByteOrder);
}
static u32 _Load_u32 (FILE* F, unsigned ByteOrder)
/* Load an u32 from the current file position and swap it if needed */
{
u32 D;
/* Read the data from the file */
fread (&D, sizeof (D), 1, F);
/* Swap bytes if needed and return the result */
return _ByteSwapIfNeeded (D, ByteOrder);
}
static u32 _Load_u24 (FILE* F, unsigned ByteOrder)
/* Load an u24 from the current file position and swap it if needed */
{
u32 D = 0;
/* Read the data from the file */
fread (&D, 3, 1, F);
/* Swap bytes if needed and return the result */
return _ByteSwapIfNeeded (D, ByteOrder);
}
static u32 Load_u32 (const AreaCodeDesc* Desc)
/* Load an u32 from the current file position and swap it if needed */
{
return _Load_u32 (Desc->F, Desc->ByteOrder);
}
static u32 Load_u24 (const AreaCodeDesc* Desc)
/* Load an u32 from the current file position and swap it if needed */
{
return _Load_u24 (Desc->F, Desc->ByteOrder);
}
static unsigned LoadShortcut (const AreaCodeDesc* Desc, char* S,
unsigned Size, unsigned char C)
/* Read the string for shortcut C into S, return the count of bytes read */
{
unsigned CharsRead = 0;
/* Get the replacement string in u32 format */
u32 Shortcut = Desc->Shortcut [C-1];
/* Insert character by character, do recursively replace shortcuts. */
while (Shortcut && CharsRead < Size-1) {
char C = Shortcut & 0x00FF;
if (IsShortcut (C)) {
/* This is another shortcut, replace it recursively */
unsigned ReadCount = LoadShortcut (Desc, S, Size-CharsRead, C);
CharsRead += ReadCount;
S += ReadCount;
} else {
/* No shortcut, use the character itself */
*S++ = C;
CharsRead++;
}
Shortcut >>= 8;
}
/* Return the count of characters read */
return CharsRead;
}
static void LoadString (const AreaCodeDesc* Desc, char* S, unsigned Count)
/* Read a zero terminated string from the file */
{
unsigned CharsRead = 0;
/* String must hold at least one character plus terminator */
assert (Count >= 2);
/* Fill the string */
while (CharsRead < Count-1) {
/* Read the next character */
int C = getc (Desc->F);
/* If it is EOF or NUL, we're done */
if (C == EOF || C == 0) {
break;
}
/* Check if we have a real character or a shortcut */
if (IsShortcut (C)) {
/* Insert a shortcut string */
unsigned ReadCount = LoadShortcut (Desc, S, Count-CharsRead, (char)C);
CharsRead += ReadCount;
S += ReadCount;
} else {
/* Normal character */
*S++ = (char) C;
CharsRead++;
}
}
/* Set the terminating zero */
*S = '\0';
}
static unsigned LoadFileHeader (AreaCodeDesc* Desc)
/* Load the header of a data file. Return one of the acXXX codes. */
{
int I;
u32 Version;
/* Load the magic word in the format used int the file (do not convert) */
u32 Magic = _Load_u32 (Desc->F, FileByteOrder);
/* Check what we got from the file, determine the byte order */
switch (Magic) {
case BigMagic:
Desc->ByteOrder = boBigEndian;
break;
case LittleMagic:
Desc->ByteOrder = boLittleEndian;
break;
default:
/* OOPS - the file is probably not a valid data file */
return acInvalidFile;
}
/* Now read the rest of the header data */
Version = Load_u32 (Desc);
Desc->Version = (Version >> 16);
Desc->Build = (Version & 0xFFFF);
Desc->Count = Load_u32 (Desc);
Desc->AreaCodeStart = Load_u32 (Desc);
Desc->InfoIndexStart = Load_u32 (Desc);
Desc->InfoStart = Load_u32 (Desc);
Desc->MinLength = Load_u32 (Desc);
Desc->CodeDataSize = Load_u32 (Desc);
/* Read the shortcuts */
for (I = 0; I < scCount; I++) {
Desc->Shortcut [I] = Load_u32 (Desc);
}
/* Check for some error conditions */
if (ferror (Desc->F)) {
/* Some sort of file problem */
return acFileError;
} else if (feof (Desc->F) || Desc->Count == 0) {
/* This should not happen on a valid file */
return acInvalidFile;
} else if ((Desc->Version & 0xFF00) != acVersion) {
return acWrongVersion;
} else {
/* Data is sane */
return acOk;
}
}
static u32 EncodeNumber (const char* Phone)
/* Encode the number we got from the caller into the internally used BCD
* format. If there are invalid chars in the number, return 0xFFFFFFFF.
*/
{
unsigned I;
unsigned Len;
u32 P = 0; /* Initialize to make gcc happy */
/* Get the amount of characters to convert */
Len = strlen (Phone);
if (Len == 0) {
/* Invalid */
return 0xFFFFFFFF;
} else if (Len > 8) {
Len = 8;
}
/* Convert the characters */
for (I = 0; I < Len; I++) {
/* Get the next character and check if it's valid */
char C = Phone [I];
if (!isascii (C) || !isdigit (C)) {
/* Invalid digit */
return 0xFFFFFFFF;
}
P = (P << 4) | (C & 0x0F);
}
/* Fill the rest of the number with 0x0F */
I = 8 - Len;
while (I--) {
P = (P << 4) | 0x0F;
}
/* Done - return the result */
return P;
}
static u32 ReadPhone (const AreaCodeDesc* Desc, long Index)
/* Read the phone number that is located at the given index. If we have a
* part of the table already loaded into memory, use the memory copy, else
* read the phone number from disk.
*/
{
if (Desc->Table && Index >= Desc->First && Index <= Desc->Last) {
/* Use the already loaded table, but don't forget to swap bytes */
return ByteSwapIfNeeded (Desc->Table [Index - Desc->First], Desc);
} else {
/* Load the value from the file */
fseek (Desc->F, Desc->AreaCodeStart + Index * sizeof (u32), SEEK_SET);
return Load_u32 (Desc);
}
}
static void LoadTable (AreaCodeDesc* Desc)
/* Load a part of the table into memory */
{
u32 SpaceNeeded = (Desc->Last - Desc->First + 1) * sizeof (u32);
Desc->Table = malloc (SpaceNeeded);
if (Desc->Table == 0) {
/* Out of memory. There is no problem with this now since we do
* not really need the table in core memory (it speeds things up,
* that's all). In addition to that, the memory requirement halves
* with each iteration, so maybe we have more luck next time.
*/
return;
}
/* Seek to the correct position in the file */
fseek (Desc->F, Desc->AreaCodeStart + Desc->First * sizeof (u32), SEEK_SET);
/* Read the data */
fread (Desc->Table, SpaceNeeded, 1, Desc->F);
}
static unsigned char CalcCodeLen (u32 Code)
/* Calculate the length of a given (encoded) area code in characters */
{
u32 Mask;
unsigned char Len = 0;
for (Mask = 0xF0000000L; Mask; Mask >>= 4) {
if ((Code & Mask) != Mask) {
Len++;
} else {
break;
}
}
return Len;
}
static unsigned CalcMatchingDigits (u32 Code1, u32 Code2)
/* Return the count of digits that match when comparing both numbers from
* left to right.
*/
{
static const u32 Masks [9] = {
0x00000000, 0xF0000000, 0xFF000000, 0xFFF00000,
0xFFFF0000, 0xFFFFF000, 0xFFFFFF00, 0xFFFFFFF0,
0xFFFFFFFF
};
unsigned I = sizeof (Masks) / sizeof (Masks [0]) - 1;
while ((Code1 & Masks [I]) != (Code2 & Masks [I])) {
/* Next one */
I--;
}
/* Return the count of matching digits */
return I;
}
/*****************************************************************************/
/* Code */
/*****************************************************************************/
unsigned GetAreaCodeInfo (acInfo* AC, const char* PhoneNumber)
/* Return - if possible - an information for the area code of the given number.
* The function returns one of the error codes defined in areacode.h. If the
* returned value is acOk, the AC struct is filled with the data of the
* area code found. If we did not have an error, but there is no area code
* that corresponds to the given number, the function returns acOk, but the
* AC struct is filled with an empty Info field and a AreaCodeLen of zero.
*/
{
u32 Phone; /* PhoneNumber encoded in BCD */
long First, Last, Current; /* For binary search */
u32 CurrentVal; /* The value at Table [Current] */
int Found; /* Flag: We've found an exact match */
unsigned RC = acOk; /* Result code of the function */
unsigned char AreaCodeLen; /* Length of areacode found */
u32 InfoStart; /* Starting offset of info string */
AreaCodeDesc Desc;
/* Clear the fields of the AC struct. Write a zero to the last field of
* Info - this field is never written to by the rest of the code. So by
* setting this to zero, we will assure a terminated string in case some
* problem prevents the code below from executing correctly.
*/
AC->Info [0] = '\0';
AC->Info [sizeof (AC->Info) - 1] = '\0';
AC->AreaCodeLen = 0;
/* Convert the phone number into the internal representation. If the
* number is invalid, return immidiately. This will also catch an empty
* phone number, so the rest of the code may safely assume that phone
* has a value that makes sense.
*/
Phone = EncodeNumber (PhoneNumber);
if (Phone == 0xFFFFFFFF) {
/* Invalid number */
return acInvalidInput;
}
/* Open the database file, check for errors */
Desc.F = fopen (acFileName, "rb");
if (Desc.F == 0) {
/* We had an error opening the file */
return acFileError;
}
/* Initialize descriptor data where needed */
Desc.Table = 0;
/* Read the header from the file */
RC = LoadFileHeader (&Desc);
if (RC != acOk) {
/* Wrong file or file read error */
goto ExitWithClose;
}
/* Add dead code to work around gcc warnings */
Current = 0;
CurrentVal = 0;
/* Now do a (eventually repeated) binary search over the data */
Found = 0;
First = 0;
do {
Last = (long) Desc.Count - 1;
while (First <= Last && Found == 0) {
/* If we don't have read the table into memory, check if we can do
* so now.
*/
if (Desc.Table == 0) {
u32 NeedMemory = (Last - First + 1) * sizeof (u32);
if (NeedMemory <= acMaxMem) {
/* Ok, the current part of the table is now small enough to
* load it into memory.
*/
Desc.First = First;
Desc.Last = Last;
LoadTable (&Desc);
}
}
/* Set current to mid of range */
Current = (Last + First) / 2;
/* Get the phone number from that place */
CurrentVal = ReadPhone (&Desc, Current);
/* Do a compare */
if (Phone > CurrentVal) {
First = Current + 1;
} else {
Last = Current - 1;
if (Phone == CurrentVal) {
/* Exact match (whow!) - terminate the loop */
Found = 1;
}
}
}
/* If we don't have an exact match, we check for a partially one. If
* Found is not true, the loop above will terminate with First > Last.
* Beware: This means that the index is eventually invalid!
*/
if (Found == 0) {
unsigned MatchingDigits; /* Count of matching digits */
/* Set the new current index and check if it is valid */
Current = First;
if (Current >= (long) Desc.Count) {
/* Not found */
goto ExitWithClose;
}
/* The index is valid, load the value */
CurrentVal = ReadPhone (&Desc, Current);
/* Calculate the length of the area code */
AreaCodeLen = CalcCodeLen (CurrentVal);
assert (AreaCodeLen > 0);
/* Check if the prefix is actually the first part of the phone
* number. If so, we've found a match.
*/
MatchingDigits = CalcMatchingDigits (Phone, CurrentVal);
if (MatchingDigits >= AreaCodeLen) {
/* Match! */
Found = 1;
} else {
/* Ok, now comes the tricky part: Since we allow numbers that
* do completely contain other numbers (e.g. 0123 --> AAAA,
* 01239 --> BBBB), we may have found the longer number and
* this caused the mismatch. There maybe a match if we remove
* one or more digits from the number.
* Because empty digits are filled with hex 'F', the shorter
* number (if one exists) comes *after* the one we found
* already. If there is a shorter number, it has - as a
* maximum - as many digits as were matching between the
* number we searched for, and the one we found. Since we have
* the length of the shortest number, contained in the data
* file in the header, we can stop, if the matching digits get
* below or equal to this value.
*/
if (MatchingDigits < Desc.MinLength) {
/* No match! */
goto ExitWithClose;
}
/* Replace all non-matching digits by hex 'F', and start again
* searching, beginning after the current value.
*/
Phone |= 0xFFFFFFFF >> (MatchingDigits * 4);
First = Current + 1;
}
}
} while (Found == 0);
/* Ok, we have now definitely found the code. Current is the index of the
* area code. Seek to the corresponding position in the name index, get
* the name position and the area code length from there (which is encoded
* together with the offset in a three byte value). To be more compatible
* with future versions, there is a field in the header that says, how big
* the area code specific data at that place is.
*/
fseek (Desc.F, Desc.InfoIndexStart + Current * Desc.CodeDataSize, SEEK_SET);
InfoStart = Load_u24 (&Desc);
/* The real area code length is in bit 20-23 of the value just read */
AC->AreaCodeLen = (unsigned) ((InfoStart & 0xF00000) >> 20) + 1;
/* Seek to the start of the info */
fseek (Desc.F, Desc.InfoStart + (InfoStart & 0xFFFFF), SEEK_SET);
/* Zero terminated info string follows. */
LoadString (&Desc, AC->Info, sizeof (AC->Info));
#ifdef CHARSET_CP850
/* Translate the info to the CP850 charset */
{
unsigned char *S = (unsigned char*) AC->Info;
while (*S) {
if (*S >= 128) {
*S = CP850Map [*S - 128];
}
S++;
}
}
#endif
ExitWithClose:
/* Free the table memory if allocated */
free (Desc.Table);
/* Close the data file */
fclose (Desc.F);
/* Done, return the result */
return RC;
}