(For the background to this work, see the
credits page. Following earlier misgivings on the wisdom
of removing IST/ISM endings, in this stemmer they are now conflated to a single
form. It can easily be modified it to bring it in line with the other Romance
stemmers: see the internal comments marked ‘IST’.
It is assumed that hyphenated forms are split into separate words prior to
stemming.)
The stemming algorithm
Letters in Romanian include the following accented forms,
-
ă â î ş ţ
The following letters are vowels:
-
a ă â e i î o u
First, i and u between vowels are put into upper case
(so that they are treated as consonants).
R1, R2
(see the note on R1 and R2)
and RV then have the same definition as in the
Spanish stemmer.
Always do steps 0, 1, 2 and 4. (Step 3 is conditional on steps 1 and 2.)
Step 0: Removal of plurals (and other simplifications)
-
Search for the longest among the following suffixes, and, if
it is in R1, perform the
action indicated.
- ul ului
- delete
- aua
- replace with a
- ea ele elor
- replace with e
- ii iua iei iile iilor ilor
- replace with i
- ile
- replace with i if not preceded by ab
- atei
- replace with at
- aţie aţia
- replace with aţi
Step 1: Reduction of combining suffixes
-
Search for the longest among the following suffixes, and, if
it is in R1, preform the replacement action indicated.
Then repeat this step until no replacement occurs.
- abilitate abilitati abilităi abilităţi
- replace with abil
- ibilitate
- replace with ibil
- ivitate ivitati ivităi ivităţi
- replace with iv
- icitate icitati icităi icităţi
icator icatori
iciv iciva icive icivi icivă
ical icala icale icali icală
- replace with ic
- ativ ativa ative ativi ativă aţiune
atoare ator atori
ătoare ător ători
- replace with at
- itiv itiva itive itivi itivă iţiune
itoare itor itori
- replace with it
Step 2: Removal of ‘standard’ suffixes
-
Search for the longest among the following suffixes, and, if
it is in R2, perform the action indicated.
- at ata ată ati ate
ut uta ută uti ute
it ita ită iti ite
ic ica ice ici ică
abil abila abile abili abilă
ibil ibila ibile ibili ibilă
oasa oasă oase os osi oşi
ant anta ante anti antă
ator atori
itate itati ităi ităţi
iv iva ive ivi ivă
- delete
- iune iuni
- delete if preceded by ţ, and replace the ţ by t.
- ism isme
ist ista iste isti istă işti
- replace with ist
Do step 3 if no suffix was removed either by step 1 or step 2.
Step 3: Removal of verb suffixes
-
Search for the longest suffix in region RV among the following,
and perform the action indicated.
- are ere ire âre
ind ând
indu ându
eze
ească
ez ezi ează esc eşti
eşte
ăsc ăşti
ăşte
am ai au
eam eai ea eaţi eau
iam iai ia iaţi iau
ui
aşi arăm arăţi ară
uşi urăm urăţi ură
işi irăm irăţi iră
âi âşi ârăm ârăţi âră
asem aseşi ase aserăm aserăţi aseră
isem iseşi ise iserăm iserăţi iseră
âsem âseşi âse âserăm âserăţi âseră
usem useşi use userăm userăţi useră
- delete if preceded in RV by a consonant or u
- ăm aţi
em eţi
im iţi
âm âţi
seşi serăm serăţi seră
sei se
sesem seseşi sese seserăm seserăţi seseră
- delete
Step 4: Removal of final vowel
-
Search for the longest among the suffixes
-
a e i ie ă
and, if it is in RV, delete it.
And finally:
-
Turn I, U back into i, u.
|
The same algorithm in Snowball (Unicode version)
-
routines (
prelude postlude mark_regions
RV R1 R2
step_0
standard_suffix combo_suffix
verb_suffix
vowel_suffix
)
externals ( stem )
integers ( pV p1 p2 )
groupings ( v )
booleans ( standard_suffix_removed )
stringescapes {}
/* special characters */
stringdef a^ hex '0E2' // a circumflex
stringdef i^ hex '0EE' // i circumflex
stringdef a+ hex '103' // a breve
stringdef s, hex '15F' // s cedilla
stringdef t, hex '163' // t cedilla
define v 'aeiou{a^}{i^}{a+}'
define prelude as (
repeat goto (
v [ ('u' ] v <- 'U') or
('i' ] v <- 'I')
)
)
define mark_regions as (
$pV = limit
$p1 = limit
$p2 = limit // defaults
do (
( v (non-v gopast v) or (v gopast non-v) )
or
( non-v (non-v gopast v) or (v next) )
setmark pV
)
do (
gopast v gopast non-v setmark p1
gopast v gopast non-v setmark p2
)
)
define postlude as repeat (
[substring] among(
'I' (<- 'i')
'U' (<- 'u')
'' (next)
)
)
backwardmode (
define RV as $pV <= cursor
define R1 as $p1 <= cursor
define R2 as $p2 <= cursor
define step_0 as (
[substring] R1 among(
'ul' 'ului'
( delete )
'aua'
( <-'a' )
'ea' 'ele' 'elor'
( <-'e' )
'ii' 'iua' 'iei' 'iile' 'iilor' 'ilor'
( <-'i')
'ile'
( not 'ab' <- 'i' )
'atei'
( <- 'at' )
'a{t,}ie' 'a{t,}ia'
( <- 'a{t,}i' )
)
)
define combo_suffix as test (
[substring] R1 (
among(
/* 'IST'. alternative: include the following
'alism' 'alisme'
'alist' 'alista' 'aliste' 'alisti' 'alist{a+}' 'ali{s,}ti' (
<- 'al'
)
*/
'abilitate' 'abilitati' 'abilit{a+}i' 'abilit{a+}{t,}i' (
<- 'abil'
)
'ibilitate' (
<- 'ibil'
)
'ivitate' 'ivitati' 'ivit{a+}i' 'ivit{a+}{t,}i' (
<- 'iv'
)
'icitate' 'icitati' 'icit{a+}i' 'icit{a+}{t,}i'
'icator' 'icatori'
'iciv' 'iciva' 'icive' 'icivi' 'iciv{a+}'
'ical' 'icala' 'icale' 'icali' 'ical{a+}' (
<- 'ic'
)
'ativ' 'ativa' 'ative' 'ativi' 'ativ{a+}' 'a{t,}iune'
'atoare' 'ator' 'atori'
'{a+}toare' '{a+}tor' '{a+}tori' (
<- 'at'
)
'itiv' 'itiva' 'itive' 'itivi' 'itiv{a+}' 'i{t,}iune'
'itoare' 'itor' 'itori' (
<- 'it'
)
)
set standard_suffix_removed
)
)
define standard_suffix as (
unset standard_suffix_removed
repeat combo_suffix
[substring] R2 (
among(
// past participle is treated here, rather than
// as a verb ending:
'at' 'ata' 'at{a+}' 'ati' 'ate'
'ut' 'uta' 'ut{a+}' 'uti' 'ute'
'it' 'ita' 'it{a+}' 'iti' 'ite'
'ic' 'ica' 'ice' 'ici' 'ic{a+}'
'abil' 'abila' 'abile' 'abili' 'abil{a+}'
'ibil' 'ibila' 'ibile' 'ibili' 'ibil{a+}'
'oasa' 'oas{a+}' 'oase' 'os' 'osi' 'o{s,}i'
'ant' 'anta' 'ante' 'anti' 'ant{a+}'
'ator' 'atori'
'itate' 'itati' 'it{a+}i' 'it{a+}{t,}i'
'iv' 'iva' 'ive' 'ivi' 'iv{a+}' (
delete
)
'iune' 'iuni' (
'{t,}'] <- 't'
)
'ism' 'isme'
'ist' 'ista' 'iste' 'isti' 'ist{a+}' 'i{s,}ti' (
<- 'ist'
/* 'IST'. alternative: remove with <- '' */
)
)
set standard_suffix_removed
)
)
define verb_suffix as setlimit tomark pV for (
[substring] among(
// 'long' infinitive:
'are' 'ere' 'ire' '{a^}re'
// gerund:
'ind' '{a^}nd'
'indu' '{a^}ndu'
'eze'
'easc{a+}'
// present:
'ez' 'ezi' 'eaz{a+}' 'esc' 'e{s,}ti'
'e{s,}te'
'{a+}sc' '{a+}{s,}ti'
'{a+}{s,}te'
// imperfect:
'am' 'ai' 'au'
'eam' 'eai' 'ea' 'ea{t,}i' 'eau'
'iam' 'iai' 'ia' 'ia{t,}i' 'iau'
// past: // (not 'ii')
'ui'
'a{s,}i' 'ar{a+}m' 'ar{a+}{t,}i' 'ar{a+}'
'u{s,}i' 'ur{a+}m' 'ur{a+}{t,}i' 'ur{a+}'
'i{s,}i' 'ir{a+}m' 'ir{a+}{t,}i' 'ir{a+}'
'{a^}i' '{a^}{s,}i' '{a^}r{a+}m' '{a^}r{a+}{t,}i' '{a^}r{a+}'
// pluferfect:
'asem' 'ase{s,}i' 'ase' 'aser{a+}m' 'aser{a+}{t,}i' 'aser{a+}'
'isem' 'ise{s,}i' 'ise' 'iser{a+}m' 'iser{a+}{t,}i' 'iser{a+}'
'{a^}sem' '{a^}se{s,}i' '{a^}se' '{a^}ser{a+}m' '{a^}ser{a+}{t,}i'
'{a^}ser{a+}'
'usem' 'use{s,}i' 'use' 'user{a+}m' 'user{a+}{t,}i' 'user{a+}'
( non-v or 'u' delete )
// present:
'{a+}m' 'a{t,}i'
'em' 'e{t,}i'
'im' 'i{t,}i'
'{a^}m' '{a^}{t,}i'
// past:
'se{s,}i' 'ser{a+}m' 'ser{a+}{t,}i' 'ser{a+}'
'sei' 'se'
// pluperfect:
'sesem' 'sese{s,}i' 'sese' 'seser{a+}m' 'seser{a+}{t,}i' 'seser{a+}'
(delete)
)
)
define vowel_suffix as (
[substring] RV among (
'a' 'e' 'i' 'ie' '{a+}' ( delete )
)
)
)
define stem as (
do prelude
do mark_regions
backwards (
do step_0
do standard_suffix
do ( standard_suffix_removed or verb_suffix )
do vowel_suffix
)
do postlude
)
|