[Snowball-discuss] a simple algorithm problem

From: ayhan peker (ayhan@aramanet.com)
Date: Sun Dec 12 2004 - 00:27:29 GMT


Hi there
I have been working on a simple Turkish stemming algorithm, and I am having
some problems with special Turkish characters. I am simply trying
to replace long words (more than 15 chars) with (l/w) and to strip some
Turkish suffixes.
My problem is that when there are Turkish chars (e.g. u") the algorithm does
not work.
When I test it with my code in Postgres:
 select lexize('tr12','asaiitılardıa');
 lexize
--------
 {l/w}
(1 row)
 
 select lexize('tr12','asaiitalardaa');
     lexize
-----------------
 {asaiitalardaa}
(1 row)
 The number of chars in the first query and the second one is the same, but the
first one is wrong whereas the second one is right.

I wonder if somebody can help me

thanks in advance.

ayhan peker

 here is my code :

// Declarations for the names defined further down in this script.
routines (
           mark_regions
           R1 R2
           common_suffix
          
)
externals ( stem )      // stem is the routine declared as external
integers ( p1 p2 p3)    // p1, p2: region marks; p3: word size (all assigned in mark_regions)
groupings ( v all )     // character classes, defined below
stringescapes {}        // {name} inside a string literal refers to a stringdef

/* special characters (in turkish) */
// NOTE(review): these single-byte hex codes look like ISO-8859-9 (Latin-5)
// values, so the input is presumably expected in that encoding - confirm
// against the encoding of the calling database/client.

stringdef u" hex 'FC' // u with diaeresis
stringdef i^ hex 'FD' // dotless i (presumably; FD in ISO-8859-9)
stringdef o" hex 'F6' // o with diaeresis
stringdef s, hex 'FE' // s with cedilla (presumably; FE in ISO-8859-9)
stringdef c, hex 'E7' // c with cedilla
stringdef g^ hex 'F0' // g with breve (presumably; F0 in ISO-8859-9)
// v: the vowels - five plain ones plus the u", o" and i^ stringdef characters.
define v 'aeiou{u"}{o"}{i^}'

// all: lower-case a-z, digits and some punctuation. It contains none of the
// stringdef characters above, so a Turkish special character is not a member
// of this grouping (the vowels among them are members of v only).
define all
'aeiouqwrtyplkjhgfdszxcvbnm1234567890!£$%^&*()-_=+[]@~;:/?><#'

/*
 * mark_regions: initialises p1 and p2 to the end of the string, records the
 * string size in p3, then tries to move p1 and p2 forward:
 *   p1 = position after the first non-vowel that follows a vowel,
 *   p2 = the same rule applied again starting from p1.
 * The search is wrapped in `do`, so if it fails part-way the marks already set
 * are kept and the routine still succeeds.
 */
define mark_regions as (
    $p1 = limit
    $p2 = limit // defaults
    // p3 is compared with 15 in stem to detect "long" words.
    // NOTE(review): if the input is in a multi-byte encoding, size may not
    // equal the number of characters - confirm.
    $p3=size
    do (
        ( gopast v gopast non-v) setmark p1
         ( gopast v gopast non-v) setmark p2
        
    )
    
)
// Routines in this section are defined in backward mode (used from the
// `backwards ( ... )` section of stem).
backwardmode (
    // R1 / R2: succeed when the cursor is at or after mark p1 / p2.
    define R1 as $p1 <= cursor
    define R2 as $p2 <= cursor
   

    // common_suffix: finds one of the listed suffixes at the cursor, marks it
    // as the current slice with [ ], and deletes it if the region test passes.
    // NOTE(review): mark_regions sets p2 after p1, so R2 presumably implies R1
    // and the `or R2` alternative looks redundant - confirm.
    define common_suffix as (
        [substring] among(
        'ler' 'lar' 'diler' 'dular' 'd{i^}lar' 'd{u"}ler' 'tiler' 'tular'
't{i^}lar' 't{u"}ler' 'dir' 'd{i^}r' 'mi{s,}' 'm{i^}{s,}' 'm{u"}{s,}'
'mu{s,}'
        'mi{s,}ler' 'm{i^}{s,}lar' 'm{u"}{s,}ler' 'mu{s,}lar'
                 (R1 or R2 delete)
            )
    )
)

/*
 * stem: external entry point.
 *   1. mark_regions sets p1, p2 and p3 (p3 = size of the input word).
 *   2. Working backwards from the end of the word, common_suffix removes one
 *      of the listed suffixes if it is present in the required region.
 *   3. If the original word size (p3) is greater than 15, the whole remaining
 *      word is replaced by the marker 'l/w'.
 *
 * NOTE(review): p3 comes from `size`; if the input arrives in a multi-byte
 * encoding this may count bytes rather than characters, which would make a
 * word with Turkish characters look longer than it is - confirm.
 */
define stem as (

    do mark_regions
    backwards (
        // `do` restores the cursor to the end of the word afterwards, whether
        // or not a suffix was removed.
        do common_suffix

        do (
            // replace the whole word if its length is more than 15 chars
            $p3 > 15
            // Take the slice from the cursor (end of word) back to the
            // backward limit (start of word) and replace it in one step.
            // This does not depend on which characters are in the `all`
            // grouping, so words containing Turkish special characters are
            // replaced completely as well.
            [tolimit] <- 'l/w'
        )
    )
)

-- 
ayhan@aramanet.com
www.aramanet.com
En fazla websayfasi iceren turkce arama motoru 



This archive was generated by hypermail 2.1.3 : Thu Sep 20 2007 - 12:02:46 BST