SimString - A fast and efficient implementation for approximate string matching

Documentation

Sample Programs

A basic sample.

#include <iostream>
#include <string>
#include <simstring/simstring.h>

/**
 * Retrieves the strings similar to a query from a database and prints them
 * on one line of standard output, separated by ", ".
 *
 *  @param  dbr         The database reader to query.
 *  @param  query       The query string.
 *  @param  measure     The similarity measure, forwarded unchanged to
 *                      simstring::reader::retrieve() (e.g. simstring::cosine).
 *  @param  threshold   The similarity threshold, forwarded unchanged to
 *                      simstring::reader::retrieve().
 */
void retrieve(
    simstring::reader& dbr,
    const std::string& query,
    int measure,
    double threshold
    )
{
    // Collect every string the reader reports for this query.
    std::vector<std::string> xstrs;
    dbr.retrieve(query, measure, threshold, std::back_inserter(xstrs));

    // Print the results separated by ", ". Walking the vector with its own
    // iterator avoids narrowing xstrs.size() to int; the separator starts
    // empty so that nothing precedes the first element.
    const char *separator = "";
    for (std::vector<std::string>::const_iterator it = xstrs.begin(); it != xstrs.end(); ++it) {
        std::cout << separator << *it;
        separator = ", ";
    }

    // The line is always terminated, so an empty result prints a blank line.
    std::cout << std::endl;
}

int main(int argc, char *argv[])
{
    // Create a SimString database with two person names.
    simstring::ngram_generator gen(3, false);
    simstring::writer_base<std::string> dbw(gen, "sample.db");

    dbw.insert("Barack Hussein Obama II");
    dbw.insert("James Gordon Brown");
    dbw.close();

    // Open the database for reading.
    simstring::reader dbr;
    
    dbr.open("sample.db");
    retrieve(dbr, "Barack Obama", simstring::cosine, 0.6);
    retrieve(dbr, "Gordon Brown", simstring::cosine, 0.6);
    retrieve(dbr, "Obama", simstring::cosine, 0.6);
    retrieve(dbr, "Obama", simstring::overlap, 1.0);

    return 0;
}

A Unicode sample.

#include <iostream>
#include <locale>
#include <string>
#include <simstring/simstring.h>

/**
 * Retrieves the wide strings similar to a query from a database and prints
 * them on one line of std::wcout, separated by ", ".
 *
 *  @param  dbr         The database reader to query.
 *  @param  query       The query string (wide characters).
 *  @param  measure     The similarity measure, forwarded unchanged to
 *                      simstring::reader::retrieve() (e.g. simstring::cosine).
 *  @param  threshold   The similarity threshold, forwarded unchanged to
 *                      simstring::reader::retrieve().
 */
void retrieve(
    simstring::reader& dbr,
    const std::wstring& query,
    int measure,
    double threshold
    )
{
    // Collect every string the reader reports for this query.
    std::vector<std::wstring> xstrs;
    dbr.retrieve(query, measure, threshold, std::back_inserter(xstrs));

    // Print the results separated by ", ". Walking the vector with its own
    // iterator avoids narrowing xstrs.size() to int; the separator starts
    // empty so that nothing precedes the first element.
    const wchar_t *separator = L"";
    for (std::vector<std::wstring>::const_iterator it = xstrs.begin(); it != xstrs.end(); ++it) {
        std::wcout << separator << *it;
        separator = L", ";
    }

    // The line is always terminated, so an empty result prints a blank line.
    std::wcout << std::endl;
}

int main(int argc, char *argv[])
{
    // Activate std::wcout.
    std::locale::global(std::locale("")); 
    std::wcout.imbue(std::locale(""));

    // Open a SimString database for writing (with std::wstring).
    simstring::ngram_generator gen(3, false);
    simstring::writer_base<std::wstring> dbw(gen, "sample_unicode.db");
    dbw.insert(L"スパゲティ");
    dbw.close();

    // Open the database for reading.
    simstring::reader dbr;
    dbr.open("sample_unicode.db");

    // Output similar strings from Unicode queries.
    retrieve(dbr, L"スパゲティー", simstring::cosine, 0.6);

    return 0;
}

Copyright (c) 2002-2010 by Naoaki Okazaki
Sun Mar 7 18:17:17 2010