/* Stemmer for Turkish * author: Evren (Kapusuz) Çilden * email: evren.kapusuz at gmail.com * version: 1.0 (15.01.2007) * stems nominal verb suffixes * stems nominal inflections * more than one syllable word check * (y,n,s,U) context check * vowel harmony check * last consonent check and conversion (b, c, d, ğ to p, ç, t, k) * The stemming algorithm is based on the paper "An Affix Stripping * Morphological Analyzer for Turkish" by Gülşen Eryiğit and * Eşref Adalı (Proceedings of the IAESTED International Conference * ARTIFICIAL INTELLIGENCE AND APPLICATIONS, February 16-18,2004, * Innsbruck, Austria * Turkish is an agglutinative language and has a very rich morphological * structure. In Turkish, you can form many different words from a single stem * by appending a sequence of suffixes. Eg. The word "doktoruymuşsunuz" means * "You had been the doctor of him". The stem of the word is "doktor" and it * takes three different suffixes -sU, -ymUs, and -sUnUz. The rules about * the append order of suffixes can be clearly described as FSMs. * The paper referenced above defines some FSMs for right to left * morphological analysis. I generated a method for constructing snowball * expressions from right to left FSMs for stemming suffixes. */ routines ( append_U_to_stems_ending_with_d_or_g // for preventing some overstemmings check_vowel_harmony // tests vowel harmony for suffixes is_reserved_word // tests whether current string is a reserved word ('ad','soyad') mark_cAsInA // nominal verb suffix mark_DA // noun suffix mark_DAn // noun suffix mark_DUr // nominal verb suffix mark_ki // noun suffix mark_lAr // noun suffix, nominal verb suffix mark_lArI // noun suffix mark_nA // noun suffix mark_ncA // noun suffix mark_ndA // noun suffix mark_ndAn // noun suffix mark_nU // noun suffix mark_nUn // noun suffix mark_nUz // nominal verb suffix mark_sU // noun suffix mark_sUn // nominal verb suffix mark_sUnUz // nominal verb suffix mark_possessives // -(U)m,-(U)n,-(U)mUz,-(U)nUz, mark_yA // noun suffix mark_ylA // noun suffix mark_yU // noun suffix mark_yUm // nominal verb suffix mark_yUz // nominal verb suffix mark_yDU // nominal verb suffix mark_yken // nominal verb suffix mark_ymUs_ // nominal verb suffix mark_ysA // nominal verb suffix mark_suffix_with_optional_y_consonant mark_suffix_with_optional_U_vowel mark_suffix_with_optional_n_consonant mark_suffix_with_optional_s_consonant more_than_one_syllable_word post_process_last_consonants postlude stem_nominal_verb_suffixes stem_noun_suffixes stem_suffix_chain_before_ki ) /* Special characters in Unicode Latin-1 and Latin Extended-A */ stringdef c. hex 'E7' // LATIN SMALL LETTER C WITH CEDILLA stringdef g~ hex '011F' // LATIN SMALL LETTER G WITH BREVE stringdef i' hex '0131' // LATIN SMALL LETTER I WITHOUT DOT stringdef o" hex 'F6' // LATIN SMALL LETTER O WITH DIAERESIS stringdef s. hex '015F' // LATIN SMALL LETTER S WITH CEDILLA stringdef u" hex 'FC' // LATIN SMALL LETTER U WITH DIAERESIS stringescapes { } integers ( strlen ) // length of a string booleans ( continue_stemming_noun_suffixes ) groupings ( vowel U vowel1 vowel2 vowel3 vowel4 vowel5 vowel6) define vowel 'ae{i'}io{o"}u{u"}' define U '{i'}iu{u"}' // the vowel grouping definitions below are used for checking vowel harmony define vowel1 'a{i'}ou' // vowels that can end with suffixes containing 'a' define vowel2 'ei{o"}{u"}' // vowels that can end with suffixes containing 'e' define vowel3 'a{i'}' // vowels that can end with suffixes containing 'i'' define vowel4 'ei' // vowels that can end with suffixes containing 'i' define vowel5 'ou' // vowels that can end with suffixes containing 'o' or 'u' define vowel6 '{o"}{u"}' // vowels that can end with suffixes containing 'o"' or 'u"' externals ( stem ) backwardmode ( // checks vowel harmony for possible suffixes, // helps to detect whether the candidate for suffix applies to vowel harmony // this rule is added to prevent over stemming define check_vowel_harmony as ( test ( (goto vowel) // if there is a vowel ( ('a' goto vowel1) or ('e' goto vowel2) or ('{i'}' goto vowel3) or ('i' goto vowel4) or ('o' goto vowel5) or ('{o"}' goto vowel6) or ('u' goto vowel5) or ('{u"}' goto vowel6) ) ) ) // if the last consonant before suffix is vowel and n then advance and delete // if the last consonant before suffix is non vowel and n do nothing // if the last consonant before suffix is not n then only delete the suffix // assumption: slice beginning is set correctly define mark_suffix_with_optional_n_consonant as ( ((test 'n') next (test vowel)) or ((not(test 'n')) test(next (test vowel))) ) // if the last consonant before suffix is vowel and s then advance and delete // if the last consonant before suffix is non vowel and s do nothing // if the last consonant before suffix is not s then only delete the suffix // assumption: slice beginning is set correctly define mark_suffix_with_optional_s_consonant as ( ((test 's') next (test vowel)) or ((not(test 's')) test(next (test vowel))) ) // if the last consonant before suffix is vowel and y then advance and delete // if the last consonant before suffix is non vowel and y do nothing // if the last consonant before suffix is not y then only delete the suffix // assumption: slice beginning is set correctly define mark_suffix_with_optional_y_consonant as ( ((test 'y') next (test vowel)) or ((not(test 'y')) test(next (test vowel))) ) define mark_suffix_with_optional_U_vowel as ( ((test U) next (test non-vowel)) or ((not(test U)) test(next (test non-vowel))) ) define mark_possessives as ( among ('m{i'}z' 'miz' 'muz' 'm{u"}z' 'n{i'}z' 'niz' 'nuz' 'n{u"}z' 'm' 'n') (mark_suffix_with_optional_U_vowel) ) define mark_sU as ( check_vowel_harmony U (mark_suffix_with_optional_s_consonant) ) define mark_lArI as ( among ('leri' 'ları') ) define mark_yU as ( check_vowel_harmony U (mark_suffix_with_optional_y_consonant) ) define mark_nU as ( check_vowel_harmony among ('n{i'}' 'ni' 'nu' 'n{u"}') ) define mark_nUn as ( check_vowel_harmony among ('{i'}n' 'in' 'un' '{u"}n') (mark_suffix_with_optional_n_consonant) ) define mark_yA as ( check_vowel_harmony among('a' 'e') (mark_suffix_with_optional_y_consonant) ) define mark_nA as ( check_vowel_harmony among('na' 'ne') ) define mark_DA as ( check_vowel_harmony among('da' 'de' 'ta' 'te') ) define mark_ndA as ( check_vowel_harmony among('nda' 'nde') ) define mark_DAn as ( check_vowel_harmony among('dan' 'den' 'tan' 'ten') ) define mark_ndAn as ( check_vowel_harmony among('ndan' 'nden') ) define mark_ylA as ( check_vowel_harmony among('la' 'le') (mark_suffix_with_optional_y_consonant) ) define mark_ki as ( 'ki' ) define mark_ncA as ( check_vowel_harmony among('ca' 'ce') (mark_suffix_with_optional_n_consonant) ) define mark_yUm as ( check_vowel_harmony among ('{i'}m' 'im' 'um' '{u"}m') (mark_suffix_with_optional_y_consonant) ) define mark_sUn as ( check_vowel_harmony among ('s{i'}n' 'sin' 'sun' 's{u"}n' ) ) define mark_yUz as ( check_vowel_harmony among ('{i'}z' 'iz' 'uz' '{u"}z') (mark_suffix_with_optional_y_consonant) ) define mark_sUnUz as ( among ('s{i'}n{i'}z' 'siniz' 'sunuz' 's{u"}n{u"}z') ) define mark_lAr as ( check_vowel_harmony among ('ler' 'lar') ) define mark_nUz as ( check_vowel_harmony among ('n{i'}z' 'niz' 'nuz' 'n{u"}z') ) define mark_DUr as ( check_vowel_harmony among ('t{i'}r' 'tir' 'tur' 't{u"}r' 'd{i'}r' 'dir' 'dur' 'd{u"}r') ) define mark_cAsInA as ( among ('cas{i'}na' 'cesine') ) define mark_yDU as ( check_vowel_harmony among ('t{i'}m' 'tim' 'tum' 't{u"}m' 'd{i'}m' 'dim' 'dum' 'd{u"}m' 't{i'}n' 'tin' 'tun' 't{u"}n' 'd{i'}n' 'din' 'dun' 'd{u"}n' 't{i'}k' 'tik' 'tuk' 't{u"}k' 'd{i'}k' 'dik' 'duk' 'd{u"}k' 't{i'}' 'ti' 'tu' 't{u"}' 'd{i'}' 'di' 'du' 'd{u"}') (mark_suffix_with_optional_y_consonant) ) // does not fully obey vowel harmony define mark_ysA as ( among ('sam' 'san' 'sak' 'sem' 'sen' 'sek' 'sa' 'se') (mark_suffix_with_optional_y_consonant) ) define mark_ymUs_ as ( check_vowel_harmony among ('m{i'}{s.}' 'mi{s.}' 'mu{s.}' 'm{u"}{s.}') (mark_suffix_with_optional_y_consonant) ) define mark_yken as ( 'ken' (mark_suffix_with_optional_y_consonant) ) define stem_nominal_verb_suffixes as ( [ set continue_stemming_noun_suffixes (mark_ymUs_ or mark_yDU or mark_ysA or mark_yken) or (mark_cAsInA (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_) or ( mark_lAr ] delete try([(mark_DUr or mark_yDU or mark_ysA or mark_ymUs_)) unset continue_stemming_noun_suffixes ) or (mark_nUz (mark_yDU or mark_ysA)) or ((mark_sUnUz or mark_yUz or mark_sUn or mark_yUm) ] delete try([ mark_ymUs_)) or (mark_DUr ] delete try([ (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_)) ]delete ) // stems noun suffix chains ending with -ki define stem_suffix_chain_before_ki as ( [ mark_ki ( (mark_DA] delete try([ (mark_lAr] delete try(stem_suffix_chain_before_ki)) or (mark_possessives] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) )) or (mark_nUn] delete try([ (mark_lArI] delete) or ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or (stem_suffix_chain_before_ki) )) or (mark_ndA ( (mark_lArI] delete) or ((mark_sU] delete try([mark_lAr]delete stem_suffix_chain_before_ki))) or (stem_suffix_chain_before_ki) )) ) ) define stem_noun_suffixes as ( ([mark_lAr] delete try(stem_suffix_chain_before_ki)) or ([mark_ncA] delete try( ([mark_lArI] delete) or ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or ([mark_lAr] delete stem_suffix_chain_before_ki) ) ) or ([(mark_ndA or mark_nA) ( (mark_lArI] delete) or (mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or (stem_suffix_chain_before_ki) ) ) or ([(mark_ndAn or mark_nU) ((mark_sU ] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or (mark_lArI))) or ( [mark_DAn] delete try ([ ( (mark_possessives ] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or (mark_lAr] delete try(stem_suffix_chain_before_ki)) or (stem_suffix_chain_before_ki) )) ) or ([mark_nUn or mark_ylA] delete try( ([mark_lAr] delete stem_suffix_chain_before_ki) or ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or stem_suffix_chain_before_ki ) ) or ([mark_lArI] delete) or (stem_suffix_chain_before_ki) or ([mark_DA or mark_yU or mark_yA] delete try([((mark_possessives] delete try([mark_lAr)) or mark_lAr) ] delete [ stem_suffix_chain_before_ki)) or ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) ) define post_process_last_consonants as ( [substring] among ( 'b' (<- 'p') 'c' (<- '{c.}') 'd' (<- 't') '{g~}' (<- 'k') ) ) // after stemming if the word ends with 'd' or 'g' most probably last U is overstemmed // like in 'kedim' -> 'ked' // Turkish words don't usually end with 'd' or 'g' // some very well known words are ignored (like 'ad' 'soyad' // appends U to stems ending with d or g, decides which vowel to add // based on the last vowel in the stem define append_U_to_stems_ending_with_d_or_g as ( test('d' or 'g') (test((goto vowel) 'a' or '{i'}') <+ '{i'}') or (test((goto vowel) 'e' or 'i') <+ 'i') or (test((goto vowel) 'o' or 'u') <+ 'u') or (test((goto vowel) '{o"}' or '{u"}') <+ '{u"}') ) ) // Tests if there are more than one syllables // In Turkish each vowel indicates a distinct syllable define more_than_one_syllable_word as ( test (atleast 2 (gopast vowel)) ) define is_reserved_word as ( test(gopast 'ad' ($strlen = 2) ($strlen == limit)) or test(gopast 'soyad' ($strlen = 5) ($strlen == limit)) ) define postlude as ( not(is_reserved_word) backwards ( do append_U_to_stems_ending_with_d_or_g do post_process_last_consonants ) ) define stem as ( (more_than_one_syllable_word) ( backwards ( do stem_nominal_verb_suffixes continue_stemming_noun_suffixes do stem_noun_suffixes ) postlude ) )