                       Fuzzy String Comparisons

                            Matthew Probert
                           Servile  Software


Boolean logic dictates that an item X is either the same as another 
item Y or is not the same. Great for mathematics, but not applicable 
to the real world. Fuzzy logic allows a third outcome: an item X is 
the same as another item Y, or is different from it, or is similar 
to it.

For example, consider the name "MATTHEW PROBERT". Compare this with 
the alternative spelling of Matthew, and you get "MATHEW PROBERT". You 
and I can see that the two strings, "MATTHEW PROBERT" and "MATHEW 
PROBERT", are almost the same. Boolean logic would argue that they are 
different. Just different, as are "BLACK" and "WHITE". However, if we 
are searching a database for the name "MATTHEW PROBERT" we often would 
like to be offered similar matching records. 

Traditionally the system used for this type of fuzzy matching was the 
soundex algorithm, which converts strings into a code based upon 
phonetic rules. The idea is that words which sound the same will 
return the same code when processed by a soundex function, and so a 
search may be made on the sound of a word, rather than a boolean 
match.

An example implementation of a soundex algorithm:

-----------------------------Cut here-----------------------------

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>

#define MAX_OMITS   9   /* number of characters to be ignored */
#define MAX_GROUPS  6   /* number of letter groups in code_group[] */

#define MAX_DIGITS  4   /*
                            length of the code sequence (the leading
                            letter plus the digits that follow it).
                            By increasing or decreasing this the
                            scope of the soundex code is changed
                        */

/* size of code sequence including the terminating NUL; parenthesised
   so the macro expands safely inside larger expressions */
#define CODE_ALLOC  (MAX_DIGITS + 1)

/*
    Characters that are dropped from a name before it is coded.
    The array is sized MAX_OMITS + 1 so that the terminating NUL of the
    initialiser is stored: the table is searched as a string, and with
    exactly MAX_OMITS elements it would not be terminated.
*/
static char omit_letter[MAX_OMITS + 1] = " AEHIOUWY";

/*
    Letter groups; translate() turns a letter into the digit
    (group index + 1).  The extra slot holds a closing NULL pointer.
*/
static char *code_group[MAX_GROUPS + 1] =  {
                                            "BFPV",
                                            "CGJKQSXZ",
                                            "DT",
                                            "L",
                                            "MN",
                                            "R",
                                            NULL
                                        };

int name_2_code(char *source, int source_size, char *target);
char translate(char chr);

/*
    Return the soundex code of the name held in s.

    s is converted to upper case IN PLACE before it is coded.

    The result points at a static buffer that is overwritten by the
    next call.  An empty string is returned when s is NULL, when s does
    not start with a letter, or when name_2_code() reports an error.
*/
char *SOUNDEX(char *s)
{
    /*
        name_2_code() terminates the code at target[MAX_DIGITS], so this
        buffer must hold at least MAX_DIGITS + 1 characters; enlarge it
        if MAX_DIGITS is ever raised above 4.
    */
    static char code[5];
    char *p;

    /* the cast keeps isalpha() defined for characters with negative values */
    if (s == NULL || !isalpha((unsigned char)*s))
        return("");

    /* make uppercase (portable replacement for the non-standard strupr) */
    for (p = s; *p != '\0'; p++)
        *p = (char)toupper((unsigned char)*p);

    /* +1 for NIL char */
    if (name_2_code(s, (int)(strlen(s) + 1), code) != 0)
        return("");

    return(code);
}

int name_2_code(char *source, int source_size, char *target)
{
    /*
        Convert a word into a unit code (alpha, digit1, digit2...digitn)
        as per the SOUNDEX method rules.

        source       the word; the letter tables are upper case, so it
                     is expected to be upper case already
        source_size  length of source including its terminating NUL
        target       receives the code; needs room for MAX_DIGITS + 1
                     characters

        Returns 0 on success, or 1 when a character is met that is
        neither in omit_letter nor alphabetic.

        A letter equal to the previously kept letter is dropped, even
        when omitted characters lie between the two.
    */

    int error;
    int i;
    int j;
    char last;      /* most recently kept letter, for duplicate filtering */

    error = 0;

    /* The first character is not translated */
    target[0] = source[0];
    last = source[0];
    j = 1;

    /* copy while filtering unwanted characters */
    for (i = 1; i < (source_size - 1) && !error; i++)
    {
        char c = source[i];

        /* the search is bounded by the table size, so it does not rely
           on omit_letter being NUL terminated */
        if (c == last || memchr(omit_letter, c, sizeof omit_letter) != NULL)
            continue;

        if (!isalpha((unsigned char)c))
        {
            error = 1;
        }
        else
        {
            /*
                Never store past the code length: letters beyond
                MAX_DIGITS are still scanned (so invalid characters are
                still detected) but are not written to target.
            */
            if (j < MAX_DIGITS)
            {
                target[j] = c;
                j++;
            }
            last = c;
        }
    }

    target[j] = '\0';

    if (!error)
    {
        /* translation */
        for (i = 1; i < j; i++)
            target[i] = translate(target[i]);

        /* zero fill */
        for (i = j; i < MAX_DIGITS; i++)
            target[i] = '0';

        target[MAX_DIGITS] = '\0';
    }

    return (error);
}



char translate(char chr)
{
    /*
        Map a letter onto its code digit: the digit is the position of
        the letter's group in code_group[], counted from one.  A
        character found in no group produces the digit for
        MAX_GROUPS + 1.
    */

    enum { DIGIT_BASE = 48 };   /* character code of the digit zero */
    int group;
    const char *member;

    for (group = 0; group < MAX_GROUPS; group++)
    {
        /* walk the letters of this group up to its terminator */
        for (member = code_group[group]; *member != '\0'; member++)
        {
            if (*member == chr)
                return (char)((group + 1) + DIGIT_BASE);
        }
    }

    /* no group contains chr */
    return (char)((MAX_GROUPS + 1) + DIGIT_BASE);
}

/*
    Demonstration code
*/


/*
    Read words from standard input until end of file and print each one
    next to its soundex code.  SOUNDEX() upper-cases the word in place,
    so the word is echoed in upper case.
*/
int main(void)
{
    char text[80];

    /* the message is split into adjacent literals; a string literal
       may not contain a raw line break */
    printf("\nDemonstration of the Soundex algorithm\n"
           "Type in words or Ctrl Z to end\n\n");

    while (fgets(text, sizeof text, stdin) != NULL)
    {
        /* cut at the newline if fgets() stored one; a line without a
           newline (end of file, or over-long input) is left intact */
        text[strcspn(text, "\n")] = '\0';

        printf("\n%s   %s\n", text, SOUNDEX(text));
    }

    return 0;
}
-----------------------------Cut here-----------------------------

The soundex algorithm is all well and good for spoken language, but a 
common problem with data entered through a keyboard is not so much 
misspelling as miskeying. The percentage comparison function below is 
better suited to comparing strings when typed at a keyboard. It 
returns the degree to which two strings are the same:

By replacing the data type float with int, the function will execute 
faster, but at reduced accuracy. How accurate a comparison you require 
will vary from one application to another.

-----------------------------Cut here-----------------------------
/*
        Fuzzy logic string comparison 
        Designed by Matthew Probert, Servile Software, 1989
*/

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Convert a NUL terminated string to upper case in place; returns s. */
static char *fuzzy_upcase(char *s)
{
    char *p;

    for (p = s; *p != '\0'; p++)
        *p = (char)toupper((unsigned char)*p);

    return s;
}

float MATCHSTR(char *src, char *tgt)
{
    /* Compare two strings in a forward order */
    /* and return a percentage match */

    /*
        Both strings are converted to upper case IN PLACE, so the
        comparison ignores case and the caller's buffers are modified.

        Characters are compared position by position over the length of
        the shorter string.  With `match` equal positions the result is
            100 * match / (length of the longer string)
        less 1 when the lengths differ.  When nothing matches, the
        result is 1 for equal lengths and 0 otherwise.
    */

    float match;
    float result;
    size_t srclen;
    size_t tgtlen;
    size_t shorter;
    size_t longer;
    size_t i;

    /* portable replacement for the non-standard strupr() */
    tgtlen = strlen(fuzzy_upcase(tgt));
    srclen = strlen(fuzzy_upcase(src));

    /* 1 when the lengths are equal, otherwise 0 */
    result = (srclen == tgtlen);

    if (tgtlen > srclen)
    {
        shorter = srclen;
        longer = tgtlen;
    }
    else
    {
        shorter = tgtlen;
        longer = srclen;
    }

    match = 0;
    for (i = 0; i < shorter; i++)
        match += (src[i] == tgt[i]);

    return( (match == 0) ? result : 100/((float)longer/match) - (1 - result));
}

float MATCHREV(char *src, char *tgt)
{
    /* Compare two strings in a reverse order */
    /* and return a percentage match */

    /*
        The strings are compared from their last characters backwards
        over the length of the shorter one.  This is done by indexing
        from the ends, so no reversed copies are allocated and the
        inputs are not modified.  Case is NOT folded here.

        With `match` equal positions the result is
            100 * match / (length of the longer string)
        less 1 when the lengths differ.  When nothing matches, the
        result is 1 for equal lengths and 0 otherwise.
    */

    float result;
    float match;
    size_t srclen;
    size_t tgtlen;
    size_t shorter;
    size_t longer;
    size_t i;

    srclen = strlen(src);
    tgtlen = strlen(tgt);

    /* 1 when the lengths are equal, otherwise 0 */
    result = (srclen == tgtlen);

    if (tgtlen > srclen)
    {
        shorter = srclen;
        longer = tgtlen;
    }
    else
    {
        shorter = tgtlen;
        longer = srclen;
    }

    match = 0;
    for (i = 1; i <= shorter; i++)
        match += (src[srclen - i] == tgt[tgtlen - i]);

    return( (match == 0) ? result : 100/((float)longer/match)  - (1 - result));
}

float COMPARE(char *s1, char *s2)
{
    /*
        Score the two strings front-to-back and back-to-front and
        return the higher of the two percentages.  MATCHSTR() is called
        first, then MATCHREV().
    */

    float forward;
    float backward;

    forward = MATCHSTR(s1,s2);
    backward = MATCHREV(s1,s2);

    if (forward > backward)
        return(forward);

    return(backward);
}

/*
    Repeatedly read two lines from standard input and print them with
    their percentage match, until end of file.
*/
int main(void)
{
    char word_1[80];
    char word_2[80];
    float score;

    for(;;)
    {
        /* the message is split into adjacent literals; a string literal
           may not contain a raw line break */
        printf("\nDemonstration of the percentage string comparison system\n"
               "Enter two strings or Ctrl Z to finish\n\n");

        if (fgets(word_1, sizeof word_1, stdin) == NULL)
            break;
        if (fgets(word_2, sizeof word_2, stdin) == NULL)
            break;

        /* cut at the newline if fgets() stored one; a line without a
           newline is left intact */
        word_1[strcspn(word_1, "\n")] = '\0';
        word_2[strcspn(word_2, "\n")] = '\0';

        /* score first, then print the words as COMPARE() left them */
        score = COMPARE(word_1,word_2);

        printf("\n%s %s %.0f%%\n",word_1,word_2,score);
    }

    return 0;
}
-----------------------------Cut here-----------------------------

