hp5 dict

Giganews Newsgroups
Subject: hp5 dict
Posted by:  Alexandru Mosoi (alexandru.mos…@gmail.com)
Date: 8 Feb 2007

I found a way to improve the compression of paq8hp5 dictionary.

Let's take the following 3 entries from the dictionary:
documentary
elementary
parliamentary

Since all three end in "mentary", that suffix can be replaced by
a single marker symbol, let's say " " (space).
The simple algorithm is as follows: if the current word ends in the common
suffix of the previous two words, substitute the current word's suffix with
a space.

the results (compressed with rar a -m5):
hp5.dic  - 411681 - 147821
hp5.parsed - 322501 - 121045
hp7.dic - 420139 - 153068

Here is the source code (very ugly, but understandable):

// released to public domain
// author: Alexandru Mosoi

#include <cstdio>
#include <cstring>
#include <string>

using namespace std;

// Decode "hp5.parsed" back into a plain word list written to "hp5.unparsed".
//
// Each input line is one word. A line whose last character is a space is a
// marker: the space stands for the common suffix of the two previously
// decoded words and is replaced by that suffix. All other lines are copied
// unchanged. If either file cannot be opened, a message is printed with
// perror() and the function returns without doing any work.
void unparse() {
    FILE* fi = fopen("hp5.parsed", "r");
    if (fi == NULL) {
        perror("hp5.parsed");
        return;
    }
    FILE* fo = fopen("hp5.unparsed", "w");
    if (fo == NULL) {
        perror("hp5.unparsed");
        fclose(fi);
        return;
    }

    // Decoded words from two lines back (last0) and one line back (last1).
    std::string last0, last1;

    char buffer[128];
    while (fgets(buffer, sizeof buffer, fi) != NULL) {
        std::string curr(buffer);

        // fgets keeps the line terminator; drop it when present.
        if (!curr.empty() && curr[curr.size() - 1] == '\n')
            curr.erase(curr.size() - 1);

        // Length of the common suffix of the two previous words.
        size_t j = 0;
        while (j < last0.size() && j < last1.size() &&
               last0[last0.size() - 1 - j] == last1[last1.size() - 1 - j])
            ++j;

        // The emptiness check keeps blank lines from indexing out of range.
        if (!curr.empty() && curr[curr.size() - 1] == ' ') {
            curr.erase(curr.size() - 1);
            curr.append(last0, last0.size() - j, j);
        }

        fprintf(fo, "%s\n", curr.c_str());

        last0 = last1;
        last1 = curr;
    }

    fclose(fo);
    fclose(fi);
}

// Encode the word list "hp5.dic" into "hp5.parsed".
//
// For every input line the common suffix of the two preceding words is
// computed. When that suffix is non-empty and the current word ends in it,
// the word is written with the suffix removed and a single trailing space in
// its place; otherwise the word is written unchanged. The trailing space is
// the marker the decoder looks for, so input words that themselves end in a
// space would be ambiguous (NOTE(review): presumably the dictionary has
// none - confirm). If either file cannot be opened, a message is printed
// with perror() and the function returns without doing any work.
void parse() {
    FILE* fi = fopen("hp5.dic", "r");
    if (fi == NULL) {
        perror("hp5.dic");
        return;
    }
    FILE* fo = fopen("hp5.parsed", "w");
    if (fo == NULL) {
        perror("hp5.parsed");
        fclose(fi);
        return;
    }

    // Original words from two lines back (last0) and one line back (last1).
    std::string last0, last1;

    char buffer[128];
    while (fgets(buffer, sizeof buffer, fi) != NULL) {
        std::string curr(buffer);

        // fgets keeps the line terminator; drop it when present.
        if (!curr.empty() && curr[curr.size() - 1] == '\n')
            curr.erase(curr.size() - 1);

        // Length of the common suffix of the two previous words.
        size_t j = 0;
        while (j < last0.size() && j < last1.size() &&
               last0[last0.size() - 1 - j] == last1[last1.size() - 1 - j])
            ++j;

        if (j > 0 && curr.size() >= j &&
            curr.compare(curr.size() - j, j, last0, last0.size() - j, j) == 0) {
            // Emit the stem followed by the space marker.
            fprintf(fo, "%s \n", curr.substr(0, curr.size() - j).c_str());
        } else {
            fprintf(fo, "%s\n", curr.c_str());
        }

        last0 = last1;
        last1 = curr;
    }

    fclose(fo);
    fclose(fi);
}

int main() {
    parse();
    unparse();
    return 0;
}

Replies