Skip to content

Commit

Permalink
Merge the spelling corrector code and make apropos use it.
Browse files Browse the repository at this point in the history
  • Loading branch information
abhinav-upadhyay committed Mar 11, 2012
1 parent 5642680 commit 8651edc
Show file tree
Hide file tree
Showing 3 changed files with 297 additions and 11 deletions.
274 changes: 274 additions & 0 deletions apropos-utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ __RCSID("$NetBSD: apropos-utils.c,v 1.2 2012/02/07 19:17:16 joerg Exp $");
#include "mandoc.h"
#include "sqlite3.h"

#define BUFLEN 1024

typedef struct orig_callback_data {
void *data;
int (*callback) (void *, const char *, const char *, const char *,
Expand All @@ -60,6 +62,11 @@ typedef struct inverse_document_frequency {
int status;
} inverse_document_frequency;

/*
 * set --
 *  A pair of strings describing one word cut at a single position:
 *  a holds the leading part and b holds the remaining trailing part.
 */
struct set {
	char *a;	/* leading part of the word */
	char *b;	/* trailing part of the word */
};
typedef struct set set;

/* weights for individual columns */
static const double col_weights[] = {
2.0, // NAME
Expand Down Expand Up @@ -352,6 +359,273 @@ init_db(int db_flag)
return NULL;
}


/*
 * Following is an implementation of a spell corrector based on Peter Norvig's
 * article: <http://norvig.com/spell-correct.html>. This C implementation is
 * written completely by me from scratch.
 */

/*
 * edits1--
 *  Generates every variant of the given word that is at edit distance 1 and
 *  returns them in a newly allocated array of newly allocated strings.
 *  With n = strlen(word) the variants are:
 *  1. Deletes: remove one character at a time: n variants.
 *  2. Transposes: swap two adjacent characters: n - 1 variants (none when
 *     n is 0 or 1).
 *  3. Replaces: replace each character by one of the 26 letters 'a'..'z':
 *     26 * n variants.
 *  4. Inserts: insert one of the 26 letters at each of the n + 1 positions:
 *     26 * (n + 1) variants.
 *
 *  The array therefore holds n + max(n - 1, 0) + 26 * n + 26 * (n + 1)
 *  entries. For each position i from 0 to n the entries are emitted in the
 *  order: delete, transpose, then for every letter a replace followed by an
 *  insert. The caller owns the array and every string in it.
 */
static char **
edits1 (char *word)
{
	size_t n = strlen(word);
	/* An empty word has no adjacent pair; n - 1 must not go negative. */
	size_t transposes = (n > 0) ? n - 1 : 0;
	size_t size = n + transposes + 26 * n + 26 * (n + 1);
	char **candidates = emalloc(size * sizeof(*candidates));
	size_t counter = 0;	/* next free slot in candidates */
	size_t i;
	char alphabet;
	char *cand;

	/*
	 * Position i conceptually splits the word into word[0..i) and
	 * word[i..n]; the variants are built straight from the word, so no
	 * temporary copies of the two halves are needed.
	 */
	for (i = 0; i <= n; i++) {
		size_t tail_len = n - i;	/* bytes after the split point */

		/* Deletes: drop word[i]. */
		if (i < n) {
			cand = emalloc(n);
			memcpy(cand, word, i);
			memcpy(cand + i, word + i + 1, tail_len - 1);
			cand[n - 1] = '\0';
			candidates[counter++] = cand;
		}

		/* Transposes: swap word[i] and word[i + 1]. */
		if (i + 1 < n) {
			cand = emalloc(n + 1);
			memcpy(cand, word, n + 1);
			cand[i] = word[i + 1];
			cand[i + 1] = word[i];
			candidates[counter++] = cand;
		}

		/* For replaces and inserts, run through 'a' to 'z'. */
		for (alphabet = 'a'; alphabet <= 'z'; alphabet++) {
			/* Replaces: overwrite word[i] with the letter. */
			if (i < n) {
				cand = emalloc(n + 1);
				memcpy(cand, word, n + 1);
				cand[i] = alphabet;
				candidates[counter++] = cand;
			}

			/* Inserts: put the letter in front of word[i]. */
			cand = emalloc(n + 2);
			memcpy(cand, word, i);
			cand[i] = alphabet;
			/* tail_len + 1 also copies the terminating NUL */
			memcpy(cand + i + 1, word + i, tail_len + 1);
			candidates[counter++] = cand;
		}
	}
	assert(counter == size);
	return candidates;
}

/*
 * known_word--
 *  Looks up the n strings in list in the mandb_dict table and returns a
 *  copy (allocated with strdup) of the one with the maximum frequency.
 *  Returns NULL if none of the words is present in the dictionary or if
 *  the query could not be prepared (a warning is printed in that case).
 *  The caller is responsible for freeing the returned string.
 *  #TODO rename this function
 */
static char *
known_word(sqlite3 *db, char **list, int n)
{
	int i, rc;
	char *sqlstr = NULL;
	char *termlist;
	char *correct = NULL;
	sqlite3_stmt *stmt = NULL;
	size_t total_len = BUFLEN * 20;	/* bytes allocated to termlist */
	size_t offset = 0;		/* next byte to write in termlist */

	/*
	 * Build termlist: a parenthesised, comma separated list of all the
	 * words as SQL string literals, for use in the IN clauses below.
	 */
	termlist = emalloc(total_len);
	termlist[offset++] = '(';

	for (i = 0; i < n; i++) {
		const char *p;
		size_t d = strlen(list[i]);
		/*
		 * Worst case every byte is a quote that gets doubled, plus
		 * the two enclosing quotes and a comma; the extra 2 bytes
		 * keep room for the final ")" and the terminating NUL.
		 */
		size_t need = 2 * d + 5;

		if (total_len - offset < need) {
			/* Grow until it really fits; one doubling may not. */
			while (total_len - offset < need)
				total_len *= 2;
			termlist = erealloc(termlist, total_len);
		}

		termlist[offset++] = '\'';
		for (p = list[i]; *p != '\0'; p++) {
			/* A quote inside an SQL literal is escaped by doubling. */
			if (*p == '\'')
				termlist[offset++] = '\'';
			termlist[offset++] = *p;
		}
		termlist[offset++] = '\'';
		if (i != n - 1)
			termlist[offset++] = ',';
	}
	termlist[offset++] = ')';
	termlist[offset] = '\0';

	easprintf(&sqlstr, "SELECT word FROM mandb_dict WHERE "
	    "frequency = (SELECT MAX(frequency) FROM mandb_dict "
	    "WHERE word IN %s) AND word IN %s", termlist, termlist);
	rc = sqlite3_prepare_v2(db, sqlstr, -1, &stmt, NULL);
	if (rc != SQLITE_OK) {
		warnx("%s", sqlite3_errmsg(db));
		goto out;
	}

	if (sqlite3_step(stmt) == SQLITE_ROW) {
		const unsigned char *text = sqlite3_column_text(stmt, 0);
		if (text != NULL)
			correct = strdup((const char *)text);
	}

out:
	/* sqlite3_finalize() on a NULL statement is a harmless no-op. */
	sqlite3_finalize(stmt);
	free(sqlstr);
	free(termlist);
	return (correct);
}

/*
 * free_list--
 *  Releases a heap allocated array of n heap allocated strings: every
 *  element list[0] .. list[n - 1] is freed and then the array itself.
 *  A NULL list is accepted and ignored.
 */
static void
free_list(char **list, int n)
{
	int i;

	if (list == NULL)
		return;

	for (i = 0; i < n; i++)
		free(list[i]);
	/* The array holding the pointers is a separate allocation. */
	free(list);
}

/*
 * edits1_count--
 *  Number of candidates edits1() generates for a non-empty word of length
 *  n: n deletes, n - 1 transposes, 26 * n replaces and 26 * (n + 1) inserts.
 *  Must not be used with n == 0, where the transposes term is not valid.
 */
static int
edits1_count(size_t n)
{
	return (int)(n + (n - 1) + 26 * n + 26 * (n + 1));
}

/*
 * spell--
 *  The API exposed to the user. Returns the most closely matched word from
 *  the dictionary. The word itself is looked up first; if it is not known,
 *  all words at edit distance 1 are tried, and if that fails too, words at
 *  edit distance 2. Returns NULL if no match is found at all, or if word
 *  is NULL or empty.
 *
 *  word is passed through lower() (defined elsewhere; presumably it
 *  lower-cases the string in place), so the caller's buffer may be modified.
 *  The returned string is heap allocated and must be freed by the caller.
 */
char *
spell(sqlite3 *db, char *word)
{
	int i;
	int count;
	int count2 = 0;
	char *correct;
	char **candidates;
	char **cand2 = NULL;

	if (word == NULL || word[0] == '\0')
		return NULL;

	lower(word);
	correct = known_word(db, &word, 1);
	if (correct != NULL)
		return correct;

	count = edits1_count(strlen(word));
	candidates = edits1(word);
	correct = known_word(db, candidates, count);

	/*
	 * No matches found? Go further and look at edit distance 2. To keep
	 * the search fast a heuristic is used: take one candidate at a time,
	 * generate its edits and stop at the first candidate that yields a
	 * match. Works reasonably fast but accuracy is not quite there in
	 * some cases.
	 */
	if (correct == NULL) {
		for (i = 0; i < count; i++) {
			/*
			 * Deleting the only letter of a one-letter word gives
			 * "". Its edits are just the 26 single letters, which
			 * the distance-1 replaces already covered, so skip it
			 * rather than generate edits of an empty string.
			 */
			if (candidates[i][0] == '\0')
				continue;

			count2 = edits1_count(strlen(candidates[i]));
			cand2 = edits1(candidates[i]);
			correct = known_word(db, cand2, count2);
			if (correct != NULL)
				break;

			free_list(cand2, count2);
			cand2 = NULL;
		}
	}

	free_list(candidates, count);
	/* cand2 is non-NULL only when the loop above stopped on a match. */
	free_list(cand2, count2);
	return correct;
}


/*
* rank_func --
* Sqlite user defined function for ranking the documents.
Expand Down
1 change: 1 addition & 0 deletions apropos-utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -89,4 +89,5 @@ void close_db(sqlite3 *);
int run_query(sqlite3 *, const char *[3], query_args *);
int run_query_html(sqlite3 *, query_args *);
int run_query_pager(sqlite3 *, query_args *);
char *spell(sqlite3*, char *);
#endif
33 changes: 22 additions & 11 deletions apropos.c
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ main(int argc, char *argv[])
char *errmsg = NULL;
char *str;
int ch, rc = 0;
char *correct_query;
char *correct;
int s;
callback_data cbdata;
cbdata.out = stdout; // the default output stream
Expand Down Expand Up @@ -151,7 +153,7 @@ main(int argc, char *argv[])
if (query == NULL)
errx(EXIT_FAILURE, "Try using more relevant keywords");

if ((db = init_db(MANDB_READONLY)) == NULL)
if ((db = init_db(MANDB_WRITE)) == NULL)
exit(EXIT_FAILURE);

/* If user wants to page the output, then set some settings */
Expand Down Expand Up @@ -181,24 +183,33 @@ main(int argc, char *argv[])
rc = run_query_pager(db, &args);
#endif

free(query);
close_db(db);
if (errmsg) {
if (errmsg || rc < 0) {
warnx("%s", errmsg);
free(errmsg);
free(query);
close_db(db);
exit(EXIT_FAILURE);
}

if (rc < 0) {
/* Something wrong with the database. Exit */
exit(EXIT_FAILURE);
}

char *orig_query = query;
char *term;
if (cbdata.count == 0) {
warnx("No relevant results obtained.\n"
correct_query = NULL;
for (term = strtok(query, " "); term; term = strtok(NULL, " ")) {
if ((correct = spell(db, term)))
concat(&correct_query, correct);
else
concat(&correct_query, term);
}

printf("Did you mean %s ?\n", correct_query);
/* warnx("No relevant results obtained.\n"
"Please make sure that you spelled all the terms correctly "
"or try using better keywords.");
"or try using better keywords.");*/
free(correct_query);
}
free(orig_query);
close_db(db);
return 0;
}

Expand Down

0 comments on commit 8651edc

Please sign in to comment.