Stemmers for Irish Gaelic and Czech
Links to resources
In March 2012 we received a stemmer for Irish Gaelic from Jimmy O’Regan
(joregan-at-gmail.com),
together with a snowball implementation of the Czech stemmer of Ljiljana
Dolamic.
Here is his original note:
Attached is a basic stemmer for Irish (and basic stopword list).
One thing that should be taken into account with Irish is the initial
mutation (n-eclipsis and h-prothesis), which causes problems with the
usual toupper and tolower. A snowball version would look something
like this:
// Illustrative routine from the note: rewrites a lower-case prefix letter
// ('n' or 't') that is followed by an upper-case vowel into the hyphenated,
// all-lower-case form, e.g. 'nA' -> 'n-a'.
define tolower_irish as (
[substring] among (
// 'n' before an upper-case vowel (plain vowels, then acute-accented ones)
'nA' (<- 'n-a')
'nE' (<- 'n-e')
'nI' (<- 'n-i')
'nO' (<- 'n-o')
'nU' (<- 'n-u')
'nÁ' (<- 'n-á')
'nÉ' (<- 'n-é')
'nÍ' (<- 'n-í')
'nÓ' (<- 'n-ó')
'nÚ' (<- 'n-ú')
// 't' before an upper-case vowel (plain vowels, then acute-accented ones)
'tA' (<- 't-a')
'tE' (<- 't-e')
'tI' (<- 't-i')
'tO' (<- 't-o')
'tU' (<- 't-u')
'tÁ' (<- 't-á')
'tÉ' (<- 't-é')
'tÍ' (<- 't-í')
'tÓ' (<- 't-ó')
'tÚ' (<- 't-ú')
)
)
I've also attached my implementation of the Dolamic stemmer for Czech.
They can be distributed under the BSD licence, if you're willing to host them.
Here is Jimmy’s Irish stemmer in Snowball:
|
-
// Irish stemmer: declarations of routines, externals, marks and the vowel set.

routines (
    initial_morph
    mark_regions
    RV R1 R2
    noun_sfx
    deriv
    verb_sfx
)

externals ( stem )

integers ( pV p1 p2 )

groupings ( v )

stringescapes {}

/* Latin 1 */

// Acute-accented vowels, written {a'} ... {u'} inside string literals.
stringdef a'   hex 'E1'  // a-acute
stringdef e'   hex 'E9'  // e-acute
stringdef i'   hex 'ED'  // i-acute
stringdef o'   hex 'F3'  // o-acute
stringdef u'   hex 'FA'  // u-acute

// Vowels: the five plain vowels plus their acute-accented forms.
define v 'aeiou{a'}{e'}{i'}{o'}{u'}'
// Compute the marks pV, p1 and p2 that the region tests RV, R1 and R2 compare
// against the cursor.
define mark_regions as (

    // Defaults: every mark starts at the limit, so a mark that cannot be
    // found below stays there.
    $pV = limit
    $p1 = limit
    $p2 = limit

    // pV: the position just after the first vowel.
    do ( gopast v  setmark pV )

    // p1: the position after the first non-vowel that follows a vowel;
    // p2: the same search repeated, starting from p1.
    do (
        gopast v  gopast non-v  setmark p1
        gopast v  gopast non-v  setmark p2
    )
)
// Undo an initial mutation or contraction at the cursor (the start of the
// word when called from stem). Entries are grouped by the replacement they
// produce; among picks the longest matching string regardless of order.
define initial_morph as (
    [substring] among (

        // Prefixes that are removed outright:
        //   'h-' 'n-' 't-'   nAthair -> n-athair, but alone are problematic
        //   'd{'}'           verbs
        //   'm{'}' 'b{'}'    other contractions
        'h-' 'n-' 't-'
        'd{'}'
        'm{'}' 'b{'}'
            ( delete )

        // -> 'b'
        'mb' 'bh'
            ( <- 'b' )

        // -> 'c'
        'gc' 'ch'
            ( <- 'c' )

        // -> 'd'
        'nd' 'dh'
            ( <- 'd' )

        // -> 'f' (including the verbal contraction d'fh)
        'd{'}fh' 'bhf' 'fh'
            ( <- 'f' )

        // -> 'g'
        'ng' 'gh'
            ( <- 'g' )

        // -> 'm'
        'mh'
            ( <- 'm' )

        // -> 'p'
        'bp' 'ph'
            ( <- 'p' )

        // -> 's'
        'sh' 'ts'
            ( <- 's' )

        // -> 't'
        'dt' 'th'
            ( <- 't' )
    )
)
backwardmode (
// Region tests: each succeeds when the cursor is at or after the
// corresponding mark set by mark_regions.
define RV as ( $pV <= cursor )
define R1 as ( $p1 <= cursor )
define R2 as ( $p2 <= cursor )
// Remove one noun ending from the end of the word, subject to a region test.
define noun_sfx as (
    [substring] among (

        // Deleted only when the R1 test succeeds.
        'amh'  'eamh'
        'abh'  'eabh'
        'aibh' 'ibh'
        'aimh' 'imh'
        'a{i'}ocht'  '{i'}ocht'
        'a{i'}ochta' '{i'}ochta'
            ( R1 delete )

        // Deleted only when the R2 test succeeds.
        'ire'    'aire'
        'ir{i'}' 'air{i'}'
            ( R2 delete )
    )
)
// Remove or normalise one derivational ending at the end of the word.
define deriv as (
    [substring] among (

        // Plain removal, guarded by R2:
        // siopadóireacht -> siopadóir but not poblacht -> pobl
        'ach'         'each'
        'acht'        'eacht'
        'achta'       'eachta'
        'acht{u'}il'  'eacht{u'}il'
            ( R2 delete )

        // monarcacht -> monarc
        'arcacht' 'arcachta' 'arcachta{i'}'
            ( <- 'arc' )

        'gineach' 'gineas' 'ginis'
            ( <- 'gin' )

        'grafa{i'}och' 'grafa{i'}ocht' 'grafa{i'}ochta' 'grafa{i'}ochta{i'}'
            ( <- 'graf' )

        'paite' 'patach' 'pataigh' 'patacha'
            ( <- 'paite' )

        '{o'}ideach' '{o'}ideacha' '{o'}idigh'
            ( <- '{o'}id' )
    )
)
// Remove one verb ending from the end of the word, subject to a region test.
define verb_sfx as (
    [substring] among (

        // Deleted only when the RV test succeeds.
        'imid'    'aimid'
        '{i'}mid' 'a{i'}mid'
        'fidh'    'faidh'
            ( RV delete )

        // Deleted only when the R1 test succeeds.
        'ain'
        'adh' 'eadh'
        '{a'}il'
        'tar' 'tear'
            ( R1 delete )
    )
)
)
// Entry point of the Irish stemmer. Every step is wrapped in `do`, so a step
// that finds nothing to change does not stop the following steps.
define stem as (
    // Forward pass: undo the initial mutation first, then compute the
    // region marks on the resulting word.
    do initial_morph
    do mark_regions

    // Backward pass over the word ending.
    backwards (
        do noun_sfx
        do deriv
        do verb_sfx
    )
)
|
And here is his implementation of Dolamic’s Czech stemmer:
|
-
// Czech stemmer: declarations of routines, externals, marks and the vowel set.

routines (
    mark_regions
    RV R1
    palatalise
    do_case
    do_possessive
    do_comparative
    do_diminutive
    do_augmentative
    do_derivational
    do_deriv_single
    do_aggressive
)

externals ( stem )

integers ( pV p1 )

groupings ( v )

stringescapes {}

/* Latin 2 */

// Letters with an acute accent ( ' )
stringdef a'   hex 'E1'
stringdef e'   hex 'E9'
stringdef i'   hex 'ED'
stringdef o'   hex 'F3'
stringdef u'   hex 'FA'
stringdef y'   hex 'FD'

// Letters written with ^
stringdef c^   hex 'E8'
stringdef d^   hex 'EF'
stringdef e^   hex 'EC'
stringdef n^   hex 'F2'
stringdef r^   hex 'F8'
stringdef s^   hex 'B9'
stringdef t^   hex 'BB'
stringdef z^   hex 'BE'

// Letter written with *
stringdef u*   hex 'F9'

// Vowels: plain vowels (including y) and their accented forms.
define v 'aeiouy{a'}{e^}{e'}{i'}{o'}{u'}{u*}{y'}'
// Compute the marks pV and p1 that the region tests RV and R1 compare
// against the cursor.
define mark_regions as (

    // Defaults: both marks start at the limit and stay there if the
    // searches below fail.
    $pV = limit
    $p1 = limit

    do (
        // pV: the position just after the first non-vowel.
        gopast non-v  setmark pV

        // p1: continuing from pV, the position after the next non-vowel
        // and then the next vowel.
        // NOTE(review): pV is keyed on a non-vowel rather than a vowel;
        // confirm this is the intended region definition.
        gopast non-v  gopast v  setmark p1
    )
)
backwardmode (
// Region tests: each succeeds when the cursor is at or after the
// corresponding mark set by mark_regions.
define RV as ( $pV <= cursor )
define R1 as ( $p1 <= cursor )
// Undo palatalisation: replace a palatalised ending found immediately before
// the cursor (and inside RV) with its hard-consonant form. Signals failure
// when none of the endings matches.
//
// Fix: the first group previously listed '{c^}' (a bare c-hacek) where the
// parallel z/z-hacek group has '{z^}e'; the missing 'e' is restored so the
// two groups agree.
//
// NOTE(review): every caller invokes this routine after `delete` or `<-`, so
// the match is made against the text to the left of the slice that was just
// removed or replaced. Confirm this is the intended position.
define palatalise as (
    [substring] RV among (
        'ci' 'ce' '{c^}i' '{c^}e'
            ( <- 'k' )
        'zi' 'ze' '{z^}i' '{z^}e'
            ( <- 'h' )
        // NOTE(review): the next two groups end in e-acute ({e'}); check
        // against the reference implementation whether i-acute ({i'}) was
        // intended.
        '{c^}t{e^}' '{c^}ti' '{c^}t{e'}'
            ( <- 'ck' )
        '{s^}t{e^}' '{s^}ti' '{s^}t{e'}'
            ( <- 'sk' )
    )
)
// Remove a possessive ending that lies inside RV.
define do_possessive as (
    [substring] RV among (

        // Removed with no further adjustment.
        'ov' '{u*}v'
            ( delete )

        // Removed, followed by an optional palatalisation fix-up.
        'in'
            ( delete  try palatalise )
    )
)
// Remove a case ending from the end of the word. No region test is applied
// here. Signals failure when no ending matches.
define do_case as (
    [substring] among (

        // Endings that are simply removed.
        'atech'
        '{e^}tem' 'at{u*}m'
        '{a'}ch' '{y'}ch' 'ov{e'}' '{y'}mi'
        'ata' 'aty' 'ama' 'ami' 'ovi'
        'at' '{a'}m' 'os' 'us' '{y'}m' 'mi' 'ou'
        'u' 'y' '{u*}' 'a' 'o' '{a'}' '{e'}' '{y'}'
            ( delete )

        // Endings that are removed and then followed by an optional
        // palatalisation fix-up.
        'ech' 'ich' '{i'}ch'
        '{e'}ho' '{e^}mi' '{e'}mu' '{e^}te' '{e^}ti' '{i'}ho' '{i'}mi'
        'emi' 'iho' 'imu'
        '{e'}m' '{i'}m' 'es'
        'e' 'i' '{i'}' '{e^}'
            ( delete  try palatalise )

        // 'em' keeps its vowel: it is reduced to 'e' before the optional
        // palatalisation fix-up.
        'em'
            ( <- 'e'  try palatalise )
    )
)
// Remove a derivational suffix that lies inside R1. Suffixes beginning with a
// front vowel are reduced to that vowel and must then palatalise; if
// palatalise fails, the routine signals failure.
define do_derivational as (
    [substring] R1 among (

        // Suffixes that are simply removed.
        'obinec'
        'ovisk' 'ovstv' 'ovi{s^}t' 'ovn{i'}k'
        '{a'}sek' 'loun' 'nost' 'teln' 'ovec' 'ov{i'}k' 'ovtv' 'ovin' '{s^}tin'
        '{a'}rn' 'och' 'ost' 'ovn' 'oun' 'out' 'ou{s^}' 'u{s^}k'
        'kyn' '{c^}an' 'k{a'}{r^}' 'n{e'}{r^}' 'n{i'}k' 'ctv' 'stv'
        '{a'}{c^}' 'a{c^}' '{a'}n' 'an' '{a'}{r^}' 'as'
        'ob' 'ot' 'ov' 'o{n^}' 'ul' 'yn'
        '{c^}k' '{c^}n' 'dl' 'nk' 'tv' 'tk' 'vk'
            ( delete )

        // Reduced to 'i'.
        'ion{a'}{r^}'
        'inec' 'itel'
        'i{a'}n' 'ist' 'isk' 'i{s^}k' 'itb'
        'ic' 'in' 'it' 'iv'
            ( <- 'i'  palatalise )

        // Reduced to 'e'.
        'enic' 'ec' 'en'
            ( <- 'e'  palatalise )

        // Reduced to e-acute.
        '{e'}{r^}'
            ( <- '{e'}'  palatalise )

        // Reduced to e-hacek.
        '{e^}n'
            ( <- '{e^}'  palatalise )

        // Reduced to i-acute.
        '{i'}rn'
        '{i'}{r^}' '{i'}n'
            ( <- '{i'}'  palatalise )
    )
)
// Remove a single trailing consonant from the listed set. Used by
// do_aggressive as the alternative when do_derivational fails.
define do_deriv_single as (
    [substring] among (
        'c' '{c^}'
        'k' 'l' 'n' 't'
            ( delete )
    )
)
// Remove an augmentative suffix from the end of the word.
define do_augmentative as (
    [substring] among (

        // Simply removed.
        'ajzn' '{a'}k'
            ( delete )

        // Reduced to 'i'; palatalise must then succeed or the routine
        // signals failure.
        'izn' 'isk'
            ( <- 'i'  palatalise )
    )
)
// Remove a diminutive suffix from the end of the word. Suffixes reduced to a
// front vowel must then palatalise; if palatalise fails, the routine signals
// failure.
define do_diminutive as (
    [substring] among (

        // Suffixes that are simply removed.
        'ou{s^}ek' '{a'}{c^}ek' 'a{c^}ek' 'o{c^}ek' 'u{c^}ek'
        'anek' 'onek' 'unek' '{a'}nek'
        'e{c^}k' '{e'}{c^}k' 'i{c^}k' '{i'}{c^}k' 'enk' '{e'}nk' 'ink' '{i'}nk'
        '{a'}{c^}k' 'a{c^}k' 'o{c^}k' 'u{c^}k' 'ank' 'onk' 'unk'
        '{a'}tk' '{a'}nk' 'u{s^}k'
        'k'
            ( delete )

        // Reduced to a front vowel, then palatalised.
        'e{c^}ek' 'enek' 'ek'
            ( <- 'e'  palatalise )
        '{e'}{c^}ek' '{e'}k'
            ( <- '{e'}'  palatalise )
        'i{c^}ek' 'inek' 'ik'
            ( <- 'i'  palatalise )
        '{i'}{c^}ek' '{i'}k'
            ( <- '{i'}'  palatalise )

        // Reduced to the leading vowel only; no palatalisation step.
        '{a'}k'
            ( <- '{a'}' )
        'ak'
            ( <- 'a' )
        'ok'
            ( <- 'o' )
        'uk'
            ( <- 'u' )
    )
)
// Reduce a comparative suffix to its leading vowel. palatalise must then
// succeed or the routine signals failure.
define do_comparative as (
    [substring] among (
        '{e^}j{s^}'
            ( <- '{e^}'  palatalise )
        'ej{s^}'
            ( <- 'e'  palatalise )
    )
)
// Extra steps of the aggressive stemmer. The first three steps are optional
// (wrapped in `do`). The final step tries do_derivational and falls back to
// do_deriv_single; if both fail, this routine signals failure.
define do_aggressive as (
    do do_comparative
    do do_diminutive
    do do_augmentative
    ( do_derivational or do_deriv_single )
)
)
// Entry point of the Czech stemmer.
//
// Fix: every step is now wrapped in `do`. Previously do_case and
// do_possessive were called bare, so a word with no case ending made the
// sequence fail at once: the possessive and aggressive steps were skipped and
// stem itself signalled failure. With `do`, each step is attempted
// independently and stem always succeeds.
define stem as (
    do mark_regions
    backwards (
        do do_case
        do do_possessive
        // light and aggressive are the same to this point
        // comment next line for light stemmer
        do do_aggressive
    )
)
// Ljiljana Dolamic and Jacques Savoy. 2009.
// Indexing and stemming approaches for the Czech language.
// Inf. Process. Manage. 45, 6 (November 2009), 714-720.
// http://members.unine.ch/jacques.savoy/clef/CzechStemmerLight.txt
// http://members.unine.ch/jacques.savoy/clef/CzechStemmerAgressive.txt
|