Stemmers for Irish Gaelic and Czech


 

Links to resources

Snowball main page
 oregan.tgz, Tar-gzipped file of resources
Irish stemmer
lower case script
— and stopwords

Czech stemmer


In March 2012 we received a stemmer for Irish Gaelic from Jimmy O’Regan (joregan-at-gmail.com), together with a Snowball implementation of Ljiljana Dolamic’s Czech stemmer.

Here is his original note:

Attached is a basic stemmer for Irish (and basic stopword list).

One thing that should be taken into account with Irish is the initial
mutation (n-eclipsis and h-prothesis), which causes problems with the
usual toupper and tolower. A snowball version would look something
like this:

// Sketch of an Irish-aware lower-casing step.
// At the cursor, a lower-case 'n' or 't' directly followed by an upper-case
// vowel is rewritten as the prefix, a hyphen, and the lower-case vowel
// (for example 'nA' becomes 'n-a').
define tolower_irish as (
    [substring] among (

        // prefixed n before an upper-case vowel
        'nA' ( <- 'n-a' )
        'nE' ( <- 'n-e' )
        'nI' ( <- 'n-i' )
        'nO' ( <- 'n-o' )
        'nU' ( <- 'n-u' )
        'nÁ' ( <- 'n-á' )
        'nÉ' ( <- 'n-é' )
        'nÍ' ( <- 'n-í' )
        'nÓ' ( <- 'n-ó' )
        'nÚ' ( <- 'n-ú' )

        // prefixed t before an upper-case vowel
        'tA' ( <- 't-a' )
        'tE' ( <- 't-e' )
        'tI' ( <- 't-i' )
        'tO' ( <- 't-o' )
        'tU' ( <- 't-u' )
        'tÁ' ( <- 't-á' )
        'tÉ' ( <- 't-é' )
        'tÍ' ( <- 't-í' )
        'tÓ' ( <- 't-ó' )
        'tÚ' ( <- 't-ú' )
    )
)



I've also attached my implementation of the Dolamic stemmer for Czech.

They can be distributed under the BSD licence, if you're willing to host them.

Here is Jimmy’s Irish stemmer in Snowball:


// Irish (Gaelic) stemmer.
//
// Processing order (see 'stem' at the end):
//   1. initial_morph - undo word-initial mutations / contractions (forward)
//   2. mark_regions  - compute the pV, p1 and p2 positions (forward)
//   3. noun_sfx, deriv, verb_sfx - suffix handling (backward), each one
//      attempted independently under 'do'

routines (
    R1 R2 RV
    initial_morph
    mark_regions
    noun_sfx
    deriv
    verb_sfx
)

externals ( stem )

integers ( pV p1 p2 )

groupings ( v )

stringescapes {}

/* Latin 1 */

stringdef a'   hex 'E1'  // a-acute
stringdef e'   hex 'E9'  // e-acute
stringdef i'   hex 'ED'  // i-acute
stringdef o'   hex 'F3'  // o-acute
stringdef u'   hex 'FA'  // u-acute

// Vowels, including the acute-accented forms defined above.
define v 'aeiou{a'}{e'}{i'}{o'}{u'}'

// Set pV, p1 and p2. All three start at 'limit'; each 'do' block only
// overrides the defaults if its scan succeeds.
define mark_regions as (

    $pV = limit
    $p1 = limit
    $p2 = limit  // defaults

    // pV: just past the first vowel
    do (
        gopast v setmark pV
    )
    // p1: just past the first non-vowel after the first vowel
    // p2: the same pattern applied once more, starting from p1
    do (
        gopast v gopast non-v setmark p1
        gopast v gopast non-v setmark p2
    )
)

// Normalise the start of the word: drop or rewrite the prefixes listed
// below. Runs in forward mode, so the match is taken at the cursor.
define initial_morph as (
    [substring] among (
        'h-' 'n-' 't-' //nAthair -> n-athair, but alone are problematic
        (delete)

        // verbs
        'd{'}'
        (delete)
        'd{'}fh'
        (<- 'f')

        // other contractions
        'm{'}' 'b{'}'
        (delete)

        'sh'
        (<- 's')

        'mb'
        (<- 'b')
        'gc'
        (<- 'c')
        'nd'
        (<- 'd')
        'bhf'
        (<- 'f')
        'ng'
        (<- 'g')
        'bp'
        (<- 'p')
        'ts'
        (<- 's')
        'dt'
        (<- 't')

        // Lenition
        'bh'
        (<- 'b')
        'ch'
        (<- 'c')
        'dh'
        (<- 'd')
        'fh'
        (<- 'f')
        'gh'
        (<- 'g')
        'mh'
        (<- 'm')
        'ph'
        (<- 'p')
        'th'
        (<- 't')
    )
)

backwardmode (

    // Region tests: true when the cursor is at or after the mark.
    define RV as $pV <= cursor
    define R1 as $p1 <= cursor
    define R2 as $p2 <= cursor

    // Noun suffixes: the first group is deleted when in R1, the second
    // group when in R2.
    define noun_sfx as (
        [substring] among (
            'amh' 'eamh' 'abh' 'eabh'
            'aibh' 'ibh' 'aimh' 'imh'
            'a{i'}ocht' '{i'}ocht' 'a{i'}ochta' '{i'}ochta'
            (R1 delete)

            'ire' 'ir{i'}' 'aire' 'air{i'}'
            (R2 delete)
        )
    )

    // Derivational suffixes: the first group is deleted when in R2; the
    // remaining groups are rewritten to a fixed stem with no region test.
    define deriv as (
        [substring] among (
            'acht' 'eacht' 'ach' 'each'
            'eacht{u'}il' 'eachta' 'acht{u'}il' 'achta'
            (R2 delete)  //siopadóireacht -> siopadóir but not poblacht -> pobl

            'arcacht' 'arcachta{i'}' 'arcachta'
            (<- 'arc') // monarcacht -> monarc

            'gineach' 'gineas' 'ginis'
            (<- 'gin')

            'grafa{i'}och' 'grafa{i'}ocht' 'grafa{i'}ochta' 'grafa{i'}ochta{i'}'
            (<- 'graf')

            // 'paite' maps to itself; the other three forms are folded onto it
            'paite' 'patach' 'pataigh' 'patacha'
            (<- 'paite')

            '{o'}ideach' '{o'}ideacha' '{o'}idigh'
            (<- '{o'}id')
        )
    )

    // Verb suffixes: the first group is deleted when in RV, the second
    // group when in R1.
    define verb_sfx as (
        [substring] among (
            'imid' 'aimid' '{i'}mid' 'a{i'}mid'
            'faidh' 'fidh'
            (RV delete)

            'ain' 'eadh' 'adh'
            '{a'}il'
            'tear' 'tar'
            (R1 delete)
        )
    )
)

// Entry point. Every step is wrapped in 'do', so a step that fails does
// not stop the following steps from running.
define stem as (
    do initial_morph
    do mark_regions
    backwards (
        do noun_sfx
        do deriv
        do verb_sfx
    )
)


And here is his implementation of Dolamic’s Czech stemmer:


// Czech stemmer (Snowball implementation of the Dolamic stemmer).
//
// Processing order (see 'stem' at the end):
//   1. mark_regions  - compute pV and p1 (forward)
//   2. do_case       - case endings (backward)
//   3. do_possessive - possessive endings (backward)
//   4. do_aggressive - comparative, diminutive, augmentative and
//                      derivational endings (backward); omit this call
//                      for the light variant

routines (
    RV R1
    palatalise
    mark_regions
    do_possessive
    do_case
    do_comparative
    do_diminutive
    do_augmentative
    do_derivational
    do_deriv_single
    do_aggressive
)

externals ( stem )

integers ( pV p1 )

groupings ( v )

stringescapes {}

/* Latin 2 */

stringdef a'   hex 'E1'  // a-acute
stringdef c^   hex 'E8'  // c-caron
stringdef d^   hex 'EF'  // d-caron
stringdef e'   hex 'E9'  // e-acute
stringdef e^   hex 'EC'  // e-caron
stringdef i'   hex 'ED'  // i-acute
stringdef n^   hex 'F2'  // n-caron
stringdef o'   hex 'F3'  // o-acute
stringdef r^   hex 'F8'  // r-caron
stringdef s^   hex 'B9'  // s-caron
stringdef t^   hex 'BB'  // t-caron
stringdef u'   hex 'FA'  // u-acute
stringdef u*   hex 'F9'  // u-ring
stringdef y'   hex 'FD'  // y-acute
stringdef z^   hex 'BE'  // z-caron

// Vowels, including the accented forms defined above.
define v 'aeiouy{a'}{e^}{e'}{i'}{o'}{u'}{u*}{y'}'

// Set pV and p1. Both start at 'limit' and are only overridden if the
// scan inside 'do' succeeds.
define mark_regions as (

    $pV = limit
    $p1 = limit

    do (
        // pV: just past the first non-vowel
        gopast non-v setmark pV
        // p1: just past the first vowel that follows the next non-vowel
        gopast non-v gopast v setmark p1
    )
)

backwardmode (

    // Region tests: true when the cursor is at or after the mark.
    define RV as $pV <= cursor
    define R1 as $p1 <= cursor

    // Rewrite a palatalised ending (ending at the cursor, within RV) to
    // its base consonant(s). Signals f when no entry matches.
    //
    // NOTE(review): '{c^}' has no trailing vowel, unlike its siblings
    // 'ci' 'ce' '{c^}i' and the parallel 'zi' 'ze' '{z^}i' '{z^}e' group;
    // it may have been meant as '{c^}e'. Likewise '{c^}t{e'}' and
    // '{s^}t{e'}' end in e-acute where i-acute may have been intended.
    // Confirm against the Dolamic reference implementation before changing.
    //
    // NOTE(review): callers invoke this right after 'delete' or '<- X' in
    // backward mode, where the cursor appears to sit at the left edge of
    // the slice, so the match would be taken on the text before the
    // replacement rather than including it - verify this is intended.
    define palatalise as (
        [substring] RV among (
            'ci' 'ce' '{c^}i' '{c^}'
            (<- 'k')

            'zi' 'ze' '{z^}i' '{z^}e'
            (<- 'h')

            '{c^}t{e^}' '{c^}ti' '{c^}t{e'}'
            (<- 'ck')

            '{s^}t{e^}' '{s^}ti' '{s^}t{e'}'
            (<- 'sk')
        )
    )

    // Possessive endings, only when in RV.
    define do_possessive as (
        [substring] RV among (
            'ov' '{u*}v'
            (delete)

            'in'
            (
                delete
                try palatalise
            )
        )
    )

    // Case endings. No region test is applied here.
    define do_case as (
        [substring] among (
            'atech'
            '{e^}tem' 'at{u*}m'
            '{a'}ch' '{y'}ch' 'ov{e'}' '{y'}mi'
            'ata' 'aty' 'ama' 'ami' 'ovi'
            'at' '{a'}m' 'os' 'us' '{y'}m' 'mi' 'ou'
            'u' 'y' '{u*}' 'a' 'o' '{a'}' '{e'}' '{y'}'
            (delete)

            'ech' 'ich' '{i'}ch'
            '{e'}ho' '{e^}mi' '{e'}mu' '{e^}te' '{e^}ti' '{i'}ho' '{i'}mi'
            'emi' 'iho' 'imu'
            '{e'}m' '{i'}m' 'es'
            'e' 'i' '{i'}' '{e^}'
            (
                delete
                try palatalise
            )

            'em'
            (
                <- 'e'
                try palatalise
            )
        )
    )

    // Derivational endings, only when in R1. In the rewriting groups
    // 'palatalise' is not wrapped in 'try', so the action signals f when
    // palatalise finds nothing to rewrite.
    define do_derivational as (
        [substring] R1 among (
            'obinec'
            'ovisk' 'ovstv' 'ovi{s^}t' 'ovn{i'}k'
            '{a'}sek' 'loun' 'nost' 'teln' 'ovec' 'ov{i'}k' 'ovtv' 'ovin' '{s^}tin'
            '{a'}rn' 'och' 'ost' 'ovn' 'oun' 'out' 'ou{s^}' 'u{s^}k'
            'kyn' '{c^}an' 'k{a'}{r^}' 'n{e'}{r^}' 'n{i'}k' 'ctv' 'stv'
            '{a'}{c^}' 'a{c^}' '{a'}n' 'an' '{a'}{r^}' 'as'
            'ob' 'ot' 'ov' 'o{n^}' 'ul' 'yn'
            '{c^}k' '{c^}n' 'dl' 'nk' 'tv' 'tk' 'vk'
            (delete)

            'ion{a'}{r^}'
            'inec' 'itel'
            'i{a'}n' 'ist' 'isk' 'i{s^}k' 'itb'
            'ic' 'in' 'it' 'iv'
            (
                <- 'i'
                palatalise
            )

            'enic' 'ec' 'en'
            (
                <- 'e'
                palatalise
            )

            '{e'}{r^}'
            (
                <- '{e'}'
                palatalise
            )

            '{e^}n'
            (
                <- '{e^}'
                palatalise
            )

            '{i'}rn' '{i'}{r^}' '{i'}n'
            (
                <- '{i'}'
                palatalise
            )
        )
    )

    // Fallback used by do_aggressive: delete a single final consonant
    // from the list below.
    define do_deriv_single as (
        [substring] among (
            'c' '{c^}' 'k' 'l' 'n' 't'
            (delete)
        )
    )

    // Augmentative endings.
    define do_augmentative as (
        [substring] among (
            'ajzn' '{a'}k'
            (delete)

            'izn' 'isk'
            (
                <- 'i'
                palatalise
            )
        )
    )

    // Diminutive endings.
    define do_diminutive as (
        [substring] among (
            'ou{s^}ek' '{a'}{c^}ek' 'a{c^}ek' 'o{c^}ek' 'u{c^}ek'
            'anek' 'onek' 'unek' '{a'}nek'
            'e{c^}k' '{e'}{c^}k' 'i{c^}k' '{i'}{c^}k'
            'enk' '{e'}nk' 'ink' '{i'}nk'
            '{a'}{c^}k' 'a{c^}k' 'o{c^}k' 'u{c^}k'
            'ank' 'onk' 'unk'
            '{a'}tk' '{a'}nk' 'u{s^}k'
            'k'
            (delete)

            'e{c^}ek' 'enek' 'ek'
            (
                <- 'e'
                palatalise
            )

            '{e'}{c^}ek' '{e'}k'
            (
                <- '{e'}'
                palatalise
            )

            'i{c^}ek' 'inek' 'ik'
            (
                <- 'i'
                palatalise
            )

            '{i'}{c^}ek' '{i'}k'
            (
                <- '{i'}'
                palatalise
            )

            '{a'}k'
            (<- '{a'}')
            'ak'
            (<- 'a')
            'ok'
            (<- 'o')
            'uk'
            (<- 'u')
        )
    )

    // Comparative endings.
    define do_comparative as (
        [substring] among (
            '{e^}j{s^}'
            (
                <- '{e^}'
                palatalise
            )

            'ej{s^}'
            (
                <- 'e'
                palatalise
            )
        )
    )

    // Extra steps of the aggressive variant. The first three are each
    // attempted under 'do'; the last step tries do_derivational and falls
    // back to do_deriv_single if that signals f.
    define do_aggressive as (
        do do_comparative
        do do_diminutive
        do do_augmentative
        do_derivational or do_deriv_single
    )
)

// Entry point.
//
// NOTE(review): do_case, do_possessive and do_aggressive are called
// without 'do' or 'try', so if an earlier one signals f the later ones
// are not run. Confirm this short-circuit is intended.
define stem as (
    do mark_regions
    backwards (
        do_case
        do_possessive
        // light and aggressive are the same to this point
        // comment next line for light stemmer
        do_aggressive
    )
)

// Ljiljana Dolamic and Jacques Savoy. 2009.
// Indexing and stemming approaches for the Czech language.
// Inf. Process. Manage. 45, 6 (November 2009), 714-720.
// http://members.unine.ch/jacques.savoy/clef/CzechStemmerLight.txt
// http://members.unine.ch/jacques.savoy/clef/CzechStemmerAgressive.txt