#!/usr/bin/perl -w use strict; package Lingua::Stem::Reference::Porter1; #use Benchmark qw(timeit timestr ); =pod =head1 NAME perl.txt - Matin Porter's Stemmer Release 1 for Perl =head1 DESCRIPTION Porter stemmer in Perl. Easy to follow against the rules in the original paper, and subsequent changes to release 1 available on the website listed below. Inputs taken from the files on the arg list, output to stdout. As an easy speed-up, one might create a hash of word=>stemmed form, and look up each new word in the hash, only calling stem() if the word was not found there. There are many Perl stemmers available at this time in module form, although some have varying degrees of accuracy. This stemmer is meant to be a reference implementation of the Porter Stemmer Release 1. =head1 SYNOPSIS Here are some examples: $ # Try test dictionary (available from website): $ perl perl.txt test.txt $ diff output.txt test.txt $ $ # Interactive test: $ perl perl.txt baking bake^D $ =head1 AUTHOR Martin Porter . Modified Perl script by Allan Fields . =head1 COPYRIGHT Copyright (C) Martin Porter, 1980. =cut # Some stemmer definition, set things up: my $VALID = '[:alpha:]'; # grab alphas from input my $c = '[^aeiou]'; # consonant my $v = '[aeiouy]'; # vowel my $C = $c . '[^aeiouy]*'; # consonant sequence my $V = $v . '[aeiou]*'; # vowel sequence my $mgr0 =qr{^ (?:$C)? $V $C }x; # [C]VC... is m>0 my $meq1 =qr{^ (?:$C)? $V $C ($V)? $}x; # [C]VC[V] is m=1 my $mgr1 =qr{^ (?:$C)? $V $C $V $C }x; # [C]VCVC... is m>1 my $_v =qr{^ (?:$C)? $v }x; # vowel in stem my %step2list = ( ational => 'ate', tional => 'tion', enci => 'ence', anci => 'ance', izer => 'ize', bli => 'ble', alli => 'al', entli => 'ent', eli => 'e', ousli => 'ous', ization => 'ize', ation => 'ate', ator => 'ate', alism => 'al', iveness => 'ive', fulness => 'ful', ousness => 'ous', aliti => 'al', iviti => 'ive', biliti => 'ble', logi => 'log' ); my $STEP2 = '(' . (join '|', sort keys %step2list) . ')'; my %step3list = ( icate => 'ic', ative => '', alize => 'al', iciti => 'ic', ical => 'ic', ful => '', ness => '' ); my $STEP3 = '(' . (join '|', sort keys %step3list) . ')'; my @step4list =qw( al ance ence er ic able ible ant ement ment ent ou ism ate iti ous ive ize ); my $STEP4 = '(' . (join '|', sort @step4list) . ')'; #my $STEP4 = '(' . (join '|', @step4list) . ')'; # <-- Use this instead if this list is preorder for optimum search speed, seems logical to order alphabetically or even regex compress, but I'm too lazy -- Eg: # --> $STEP4 = qr( a(?:ble|l|nce|te) | i(?:ble|c|sm|ti|ve|ze) ... )x; # Then define stem($word) to stem $word: sub stem { my ($w); #== Handle arrays, references, etc.: =======================================# # - Shouldn't add significant overhead # - Overhead should be more than made up for when calling with long lists # - Please inform of issues with this handler my @word = (); foreach (@_) { if (my $ref = ref) { if ($ref eq 'ARRAY') { # 2: Second most likely case: Array Ref foreach $w(@{$_}) { if (not ref $w) { push @word, lc $w; } else { stem($w); # 4: Deep placed reference, go deep warn "Deep reference in stem, going deep..."; } } } elsif ($ref eq 'SCALAR') { push @word, lc ${$_}; # 3: Reference to scalar } else { die "Unsupported reference of type '$ref' passed to stem."; } } else { push @word, lc; # 1: Most likely case: Scalar, Array } } # Or use: map { push @word, lc } @_; return undef if not @word; #====================================================== -- Allan Fields ====# foreach (@word) { my ($stem, $suffix); next if length($_) < 3; # length at least 3 for stemming # Prelude - Map initial y to Y so that the patterns never treat it as vowel: s/^y/Y/; # Step 1a: if (/(ss|i)es$/) { $_ = $` . $1 } elsif (/([^s])s$/) { $_ = $` . $1 } # Step 1b: if (/eed$/) { if ($` =~ /$mgr0/o) { chop } } elsif (/(?:ed|ing)$/) { $stem = $`; if ($stem =~ /$_v/o) { $_ = $stem; if (/(?:at|bl|iz)$/) { $_ .= 'e' } elsif (/([^aeiouylsz])\1$/) { chop } elsif (/^${C}${v}[^aeiouwxy]$/o) { $_ .= 'e' } } } # Step 1c: if (/y$/) { $stem = $`; $_ = "${stem}i" if ($stem =~ /$_v/o); } # Step 2: if (/$STEP2$/o) { $stem = $`; $suffix = $1; $_ = $stem . $step2list{$suffix} if ($stem =~ /$mgr0/o); } # Step 3: if (/$STEP3$/o) { $stem = $`; $suffix = $1; $_ = $stem . $step3list{$suffix} if ($stem =~ /$mgr0/o); } # Step 4: if (/$STEP4$/o) { $stem = $`; $_ = $stem if ($stem =~ /$mgr1/o); } elsif (/(s|t)ion$/) { $stem = $` . $1; $_ = $stem if ($stem =~ /$mgr1/o); } # Step 5: if (/e$/) { $stem = $`; $_ = $stem if ( $stem =~ /$mgr1/o or ( $stem =~ /$meq1/o and not $stem =~ /^ $C $v [^aeiouwxy] $/xo ) ); } chop if (/ll$/ and /$mgr1/o); # Postlude - Turn initial Y back to y: s/^Y/y/; } return wantarray? @word : $word[0]; } # Read in words and stem to stdout: while (<>) { while (/\G(.*?)([$VALID]+)/cog) { print "$1", (defined $2? stem($2):''); } print "\n"; } __END__ # Test array, reference handling code: print "Results:\n"; print " - $_\n" for stem(( 'hi', [ 'debate', 'debunked', 'there', 'walking' ], 'talking', \'baking', ('hardly', 'negotiate', 'anticipation'), 'rationalize' ) ); # Some other input methods to try: print "Enter words: \n\n"; while (<>) { if (my @word = /[$VALID]+/cog) { # One per line #foreach (@word) { # print " - $_ => "; # my $result = stem($_); # print '',(defined $result? "$result":'(undef)'),"\n"; #} # Many per line w/ benchmark my @result; print "Stemming: ",(join ', ', @word),"...\n"; my $count = 10000; my $t = timeit($count, sub { @result = stem(@word); }); print "Result: ",(join ', ', @result),".\n"; warn "$count loops of stemming code took:",timestr($t),"\n"; print "\n"; } } =head1 HISTORY perl.txt: =over 4 =item - Release 1 of Porter Stemmer: by Martin Porter. =item - Code changes, litte/no algorithm changes: Allan Fields, 2002. Will submit any algorithm suggestions as part of snowball project instead. =back =head1 SEE ALSO =over 4 =item * Porter, M.F., "An algorithm for suffix stripping", Program, Vol. 14 (3), July 1980, pp 130-137. =item * http://www.tartarus.org/~martin/PorterStemmer - Martin Porter's official Porter Stemmer website =item * http://snowball.sourceforge.net/ - Snowball project, newest stemmers. Multilingual stemmers written in snowball. =back =cut