Re[2]: [Snowball-discuss] an inconsistency with Russian stemmer

From: Andrew Aksyonoff (shodan@chat.ru)
Date: Fri Nov 16 2001 - 13:04:17 GMT


Hello Martin!

Friday, November 16, 2001, 12:50:45 PM, you wrote:
MP> Andrew, I'm posting this to the discussion list and copying the email to you.
Thanks for the fast reply. I'll do the same.

MP> There is of course no need to implement the stemmer in C, since a stemmer in
MP> C, generated from the Snowball script, is provided on the site. Were you not
MP> aware of that, or did you prefer to develop one yourself?
I was, but I decided to implement my own one - since the
algorithm is very simple, it seemed easier than even linking
with the provided sources. There were other reasons, too (to get
acquainted with the stemming algorithms, to try improving
this one, to test implementations for speed, etc).

MP> In any case, I
MP> cannot see how to fix your problem from the information you have sent me.
MP> You would need to send me the sources of the program you have developed.
Surely. Here goes (in KOI-8, if this is unreadable, I'll
send you a version where all Russian letters are replaced
with hex numbers):

--- cut ---
#include <stdio.h>
#include <string.h>

// One suffix rule: when a word ends with `suffix`, stem_ru_table() shortens
// the word length by `remove`.
typedef struct {
        unsigned char suffix[8]; // NUL-terminated ending, compared byte by byte against the tail of the word
        int remove;              // amount subtracted from the word length on a match; may be smaller than the suffix length
} stem_table;

// Lowercase Russian letters as listed by the author; not referenced by the code in this listing.
static unsigned char stem_ru_letters[] = "абвгдежзийклмнопрстуфхцчшщъыьэюя";
// The letters that stem_ru_iv() reports as vowels.
static unsigned char stem_ru_vowels[] = "аеиоуыэюя";

// Gerund endings, applied by stem_ru_gerund(); stem_ru() tries this table first.
// stem_ru_table() stops at the first entry that matches, so entry order matters.
// For the а/я entries `remove` is one less than the number of letters, which
// leaves the leading а/я in the word; the и/ы entries are removed whole.
// The counts are bytes, so they presume one byte per letter (the accompanying
// message says the listing is in KOI-8).
static stem_table ru_gerund[] = {
        { "ав", 1 },
        { "авши", 3 },
        { "авшись", 5 },

        { "яв", 1 },
        { "явши", 3 },
        { "явшись", 5 },

        { "ив", 2 },
        { "ивши", 4 },
        { "ившись", 6 },

        { "ыв", 2 },
        { "ывши", 4 },
        { "ывшись", 6 },
};

// Adjective endings, applied by stem_ru_adj() (called from stem_ru_adjectival()).
// Every entry removes its whole suffix: `remove` equals the number of letters.
// The counts are bytes, so they presume one byte per letter (KOI-8 per the message).
static stem_table ru_adj[] = {
        { "ее", 2 },
        { "ие", 2 },
        { "ые", 2 },
        { "ое", 2 },
        { "ими", 3 },
        { "ыми", 3 },
        { "ей", 2 },
        { "ий", 2 },
        { "ый", 2 },
        { "ой", 2 },
        { "ем", 2 },
        { "им", 2 },
        { "ым", 2 },
        { "ом", 2 },
        { "его", 3 },
        { "ого", 3 },
        { "ему", 3 },
        { "ому", 3 },
        { "их", 2 },
        { "ых", 2 },
        { "ую", 2 },
        { "юю", 2 },
        { "ая", 2 },
        { "яя", 2 },
        { "ою", 2 },
        { "ею", 2 }
};

// Participle endings, applied by stem_ru_part(); stem_ru_adjectival() tries
// them only after an adjective ending has been removed.
// For the а/я entries `remove` is one less than the number of letters, so the
// leading а/я stays in the word; the и/ы/у entries are removed whole.
// The counts are bytes, so they presume one byte per letter (KOI-8 per the message).
static stem_table ru_part[] = {
        { "аем", 2 },
        { "анн", 2 },
        { "авш", 2 },
        { "ающ", 2 },
        { "ащ", 1 },
        { "яем", 2 },
        { "янн", 2 },
        { "явш", 2 },
        { "яющ", 2 },
        { "ящ", 1 },
        { "ивш", 3 },
        { "ывш", 3 },
        { "ующ", 3 }
};

// Reflexive endings, applied by stem_ru_reflex() (called from stem_ru_verbal()).
// Both entries are removed whole.
static stem_table ru_reflex[] = {
        { "ся", 2 },
        { "сь", 2 }
};

// Verb endings, applied by stem_ru_verb() (called from stem_ru_verbal()).
// stem_ru_table() stops at the first entry that matches, so entry order matters.
// First group (entries starting with а or я): `remove` is one less than the
// number of letters, so the leading а/я stays in the word.
// Second group (after the blank line): the whole suffix is removed.
// The counts are bytes, so they presume one byte per letter (KOI-8 per the message).
static stem_table ru_verb[] = {
        { "ала", 2 },
        { "ана", 2 },
        { "аете", 3 },
        { "айте", 3 },
        { "али", 2 },
        { "ай", 1 },
        { "ал", 1 },
        { "аем", 2 },
        { "ан", 1 },
        { "ало", 2 },
        { "ано", 2 },
        { "ает", 2 },
        { "ают", 2 },
        { "аны", 2 },
        { "ать", 2 },
        { "аешь", 3 },
        { "анно", 3 },
        { "яла", 2 },
        { "яна", 2 },
        { "яете", 3 },
        { "яйте", 3 },
        { "яли", 2 },
        { "яй", 1 },
        { "ял", 1 },
        { "яем", 2 },
        { "ян", 1 },
        { "яло", 2 },
        { "яно", 2 },
        { "яет", 2 },
        { "яют", 2 },
        { "яны", 2 },
        { "ять", 2 },
        { "яешь", 3 },
        { "янно", 3 },

        { "ила", 3 },
        { "ыла", 3 },
        { "ена", 3 },
        { "ейте", 4 },
        { "уйте", 4 },
        { "ите", 3 },
        { "или", 3 },
        { "ыли", 3 },
        { "ей", 2 },
        { "уй", 2 },
        { "ил", 2 },
        { "ыл", 2 },
        { "им", 2 },
        { "ым", 2 },
        { "ен", 2 },
        { "ило", 3 },
        { "ыло", 3 },
        { "ено", 3 },
        { "ят", 2 },
        { "ует", 3 },
        { "уют", 3 },
        { "ит", 2 },
        { "ыт", 2 },
        { "ены", 3 },
        { "ить", 3 },
        { "ыть", 3 },
        { "ишь", 3 },
        { "ую", 2 },
        { "ю", 1 }
};

// Noun endings, applied by stem_ru_noun(). Every entry removes its whole suffix.
// stem_ru_table() stops at the first entry that matches, so where one ending is
// the tail of another (e.g. "ие"/"е", "иями"/"ями", "ию"/"ю") the longer one is
// listed first and therefore wins.
// The counts are bytes, so they presume one byte per letter (KOI-8 per the message).
static stem_table ru_noun[] = {
        { "а", 1 },
        { "ев", 2 },
        { "ов", 2 },
        { "ие", 2 },
        { "ье", 2 },
        { "е", 1 },
        { "иями", 4 },
        { "ями", 3 },
        { "ами", 3 },
        { "еи", 2 },
        { "ии", 2 },
        { "и", 1 },
        { "ией", 3 },
        { "ей", 2 },
        { "ой", 2 },
        { "ий", 2 },
        { "иям", 3 },
        { "ям", 2 },
        { "ием", 3 },
        { "ам", 2 },
        { "ом", 2 },
        { "о", 1 },
        { "у", 1 },
        { "ах", 2 },
        { "иях", 3 },
        { "ях", 2 },
        { "ы", 1 },
        { "ь", 1 },
        { "ию", 2 },
        { "ью", 2 },
        { "ю", 1 },
        { "ия", 2 },
        { "ья", 2 },
        { "я", 1 }
};

// Superlative endings, applied by stem_ru_super(); stem_ru() tries them after
// the ru_deriv step. Both entries are removed whole.
static stem_table ru_super[] = {
        { "ейш", 3 },
        { "ейше", 4 }
};

// Derivational endings, applied by stem_ru_deriv(); stem_ru() matches them only
// against the part of the word that starts at its r2 offset.
// Both entries are removed whole.
static stem_table ru_deriv[] = {
        { "ост", 3 },
        { "ость", 4 }
};

// Tell whether the byte `l` is one of the letters listed in stem_ru_vowels.
// Returns 1 for a vowel and 0 otherwise.
// A NUL byte is never a vowel: a plain scan would stop on the table's own
// terminator and report a match for l == 0, so that case is rejected up front.
int stem_ru_iv(unsigned char l)
{
        const unsigned char *v;

        if (l == 0) return 0;
        for (v = stem_ru_vowels; *v; v++)
                if (*v == l) return 1;
        return 0;
}

// Try the entries of `table` (ntable of them), in order, against the tail of
// word[0 .. *len-1]. On the first entry whose suffix the word ends with,
// subtract that entry's `remove` from *len and return 1. Return 0 and leave
// *len untouched if no entry matches. The bytes of `word` are never modified.
int stem_ru_table(unsigned char *word, int *len, stem_table *table, int ntable)
{
        int i, j, k, slen;

        for (i = 0; i < ntable; i++) {
                // Measure the suffix with a scan bounded by the field size
                // instead of strlen(): the field is unsigned char (not char),
                // and a suffix that fills all of it has no terminating NUL.
                slen = 0;
                while (slen < (int)sizeof(table[i].suffix) && table[i].suffix[slen])
                        slen++;

                j = slen - 1;
                k = (*len) - 1;
                if (j > k) continue; // suffix is longer than the word
                // Compare backwards from the last byte of both strings.
                for (; j >= 0; k--, j--)
                        if (word[k] != table[i].suffix[j]) break;
                if (j >= 0) continue; // stopped early: mismatch

                *len -= table[i].remove;
                return 1;
        }
        return 0;
}

#define STEM_RU_FUNC(func,table) \
        int func(char *word, int *len) \
        { \
                return stem_ru_table(word, len, \
                        table, sizeof(table) / sizeof(stem_table)); \
        }

STEM_RU_FUNC(stem_ru_gerund, ru_gerund)
STEM_RU_FUNC(stem_ru_adj, ru_adj)
STEM_RU_FUNC(stem_ru_part, ru_part)
STEM_RU_FUNC(stem_ru_reflex, ru_reflex)
STEM_RU_FUNC(stem_ru_verb, ru_verb)
STEM_RU_FUNC(stem_ru_noun, ru_noun)
STEM_RU_FUNC(stem_ru_super, ru_super)
STEM_RU_FUNC(stem_ru_deriv, ru_deriv)

// Cut an adjective ending and, if one was found, also try to cut a participle
// ending that precedes it. Returns 1 when an adjective ending was removed
// (whether or not a participle ending followed), 0 when nothing was removed.
int stem_ru_adjectival(unsigned char *word, int *len)
{
        int found_adj = stem_ru_adj(word, len);

        if (!found_adj)
                return 0;

        // The participle step is optional; its result does not affect ours.
        stem_ru_part(word, len);
        return 1;
}

// Cut a verb ending, optionally preceded by a reflexive ending.
// If a reflexive ending is found it is dropped first and stays dropped: the
// function then tries verb, adjectival and noun endings in that order and
// returns 1 whether or not any of them matched.
// Without a reflexive ending only the verb table is tried.
// Returns 1 if *len was shortened, 0 otherwise.
int stem_ru_verbal(unsigned char *word, int *len)
{
        if (stem_ru_reflex(word, len)) {
                if (stem_ru_verb(word, len)) return 1;
                if (stem_ru_adjectival(word, len)) return 1;
                if (stem_ru_noun(word, len)) return 1; // result does not matter: the next line returns 1 either way
                return 1; // removing the reflexive ending alone already counts as stemming
        }
        return stem_ru_verb(word, len);
}

// Stem the NUL-terminated word `word` in place by cutting endings off its tail
// and writing a new terminator.
//
// Three offsets are computed first: rv (just past the first vowel), r1 (just
// past the first vowel that is followed by a non-vowel) and r2 (the same rule
// applied again starting at r1). A word with no vowel, or whose first vowel is
// its last byte, is left unchanged. Only the part after rv is ever cut.
//
// Steps, in order:
//   1. the first of gerund / adjectival / verbal / noun endings that matches;
//   2. a trailing й or и;
//   3. a ru_deriv ending, only if it lies entirely at or after r2;
//   4. a ru_super ending, then a doubled н is reduced to one, then a trailing ь.
//
// The letter constants are converted to unsigned char before being compared
// with the word's bytes: where plain char is signed, a constant such as 'й'
// has a negative value and would never equal an unsigned char byte.
void stem_ru(unsigned char *word)
{
        const unsigned char ch_short_i = (unsigned char)'й';
        const unsigned char ch_i = (unsigned char)'и';
        const unsigned char ch_n = (unsigned char)'н';
        const unsigned char ch_soft = (unsigned char)'ь';
        int rv, r1, r2;
        int i, len;

        len = (int)strlen((const char *)word);
        rv = r1 = r2 = len;
        for (i = 0; i < len; i++)
                if (stem_ru_iv(word[i])) { rv = i+1; break; }
        if (rv == len) return; // no vowel, or nothing follows the first vowel

        for (i = 0; i < len-1; i++)
                if (stem_ru_iv(word[i]) && !stem_ru_iv(word[i+1])) { r1 = i+2; break; }
        for (i = r1; i < len-1; i++)
                if (stem_ru_iv(word[i]) && !stem_ru_iv(word[i+1])) { r2 = i+2; break; }

        // From here on, work on the part after rv; offsets become relative to it.
        word += rv;
        len -= rv;
        r1 -= rv;
        r2 -= rv;

        // Step 1: stop at the first kind of ending that matches.
        if (!stem_ru_gerund((char *)word, &len)
         && !stem_ru_adjectival(word, &len)
         && !stem_ru_verbal(word, &len))
                stem_ru_noun((char *)word, &len);

        // Step 2.
        if (len > 0 && (word[len-1] == ch_short_i || word[len-1] == ch_i)) len--;

        // Step 3: match only against the bytes from r2 on. If step 1/2 already
        // cut below r2 the temporary length is negative and nothing matches.
        len -= r2;
        stem_ru_deriv((char *)(word+r2), &len);
        len += r2;

        // Step 4.
        stem_ru_super((char *)word, &len);
        if (len > 1 && word[len-2] == ch_n && word[len-1] == ch_n) len--;
        if (len > 0 && word[len-1] == ch_soft) len--; // len > 0: never look at the byte before rv

        word[len] = 0;
}

// Read words from standard input, one per line (lines longer than the buffer
// are processed in pieces), stem each with stem_ru() and print the result on
// its own line. Returns 0 when input is exhausted.
int main(void)
{
        char buf[256];

        while (fgets(buf, sizeof(buf), stdin)) {
                size_t n = strlen(buf);

                // Strip the newline kept by fgets(); n can be 0 if the line
                // starts with a NUL byte, so check before indexing n-1.
                if (n > 0 && buf[n-1] == '\n') buf[n-1] = 0;
                stem_ru((unsigned char *)buf);
                printf("%s\n", buf);
        }
        return 0;
}
--- cut ---

Note: I do in-place stemming, so stem_ru_*() functions return 1
if there was any stemming, 0 otherwise, and actual "stemming"
is done by simply adjusting "len" variable.

First problem is as follows.

My implementation (shorthand: MY) is

// MY: once a reflexive ending has been removed it is never restored; the
// function returns 1 even if no verb, adjectival or noun ending follows.
int stem_ru_verbal(unsigned char *word, int *len)
{
        if (stem_ru_reflex(word, len)) {
                if (stem_ru_verb(word, len)) return 1;
                if (stem_ru_adjectival(word, len)) return 1;
                if (stem_ru_noun(word, len)) return 1;
                return 1; // reflexive ending stays removed
        }
        return stem_ru_verb(word, len);
}

while I believe both the explanation and Snowball
source mean something like (shorthand: ORIG)

// ORIG: the reflexive ending is removed only if a verb or adjectival ending
// precedes it; otherwise *len is restored and only the verb table is tried
// on the unshortened word.
int stem_ru_verbal(unsigned char *word, int *len)
{
        int save = *len; // length before any ending is removed
        
        if (stem_ru_reflex(word, len)) {
                if (stem_ru_verb(word, len)) return 1;
                if (stem_ru_adjectival(word, len)) return 1;
                *len = save; // this undoes stem_ru_reflex(word, len)
        }
        return stem_ru_verb(word, len);
}

The difference can be seen on the following examples:

"avos'": MY gives "av", ORIG gives "avos"
"bereglas'": MY gives "beregl", ORIG gives "bereglas"

Second problem (not only trailing "i" but "i'" should be
stemmed on step 2) can be seen on

"zmei'": MY gives "zme", ORIG gives "zmei'"
"znai'": MY gives "zna", ORIG gives "znai'"

In general, the problems are as follows:

1) MY implementation gives results matching perfectly
   with output.txt (which it should not);
2) ORIG implementation in turn does not (which it should);
3) Performing the algorithm by hand, I receive the
   very same results as with ORIG.

Thus, I'd be very grateful if you would show me the sequence
of stemming actions one should take according to the currently
published algorithm to reduce "zmei'" to "zme" and "bereglas'"
to "beregl" - I'll be able to find my error from that.

A simple example of such sequence:

word: vazhnei'shimi
Step 1: remove adjectival "imi"
Step 2: do nothing
Step 3: do nothing
Step 4: remove superlative "ei'sh"

The sequence I keep getting for "bereglas'":

word: bereglas'
Step 1: remove noun ending "'"
Step 2: do nothing
Step 3: do nothing
Step 4: do nothing

The trouble here is that "gla" which precedes reflexive
"s'" and thus should be either a verb or adjectival
does not fit there. Thus, "s'" is treated as a noun
ending "'".

I hope my explanation of what's going on is clear enough.

MP> P.S. Not related to the great Vassili Aksyonov, I suppose?
No, this surname is widespread enough in Russia.

- Andrew

_______________________________________________
Snowball-discuss mailing list
Snowball-discuss@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/snowball-discuss

_____________________________________________________________________
VirusChecked by the Incepta Group plc
_____________________________________________________________________



This archive was generated by hypermail 2.1.3 : Thu Sep 20 2007 - 12:02:40 BST