Wednesday, March 18, 2015

1980 Porter Stemmer in Awk

################################################################################ 
#  This is the Porter stemming algorithm, coded up in awk by Gregory Grefenstette
#  July 5, 2012
#  It follows the  algorithm presented in
#
#  Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
#  no. 3, pp 130-137,
#
# and more precisely the code for the ANSI C version found at
#
#  http://www.tartarus.org/~martin/PorterStemmer
#
# This endioding of the algorthm can be used free of charge for any purpose
#
# TRUE if last two characters are a double consonant
#
# Temporary Modification 20150318 by AMDanischewski 
#  -- Changed step1ab to also clip -er and -est, this breaks many words 
#  but it fixes more than it breaks, so I put it on there. Prior to this 
#  words like lighter and lightest won't be stemmed.   
#
# Issues: -er, -est are not stemmed properly 
#         words ending in -y are brokenly returned as ending -i 
# 
################################################################################ 

function doublec(s)
{ if(substr(s,length(s),1) == substr(s,length(s)-1,1) && (substr(s,length(s),1)  !~ /[aeiou]/)) return 1;
                                                         else return 0 }

# antyhing other than one of a, e, i, o, u, for the case of "y" it checks if
# there is a preceding vowel, in which case it is considered a consonant

function cons(str,i)
{ if(substr(str,i,1) ~ /[aeiou]/) return 0 ;
  if(i==1) return 1;
  if(substr(str,i,1) == "y") { if(substr(str,i-1,1)  ~ /[aeiou]/) return 1; else return 0 }
  return 1
}

# cvc(i) is TRUE if last three characters of str are  consonant - vowel - consonant
# and also if the second c is not w,x or y. this is used when trying to
#   restore an e at the end of a short word. e.g.
#
#  cav(e), lov(e), hop(e), crim(e), but
#  snow, box, tray.

function cvc(str) {
  if(length(str) <= 2) return 0;
  if( str ~ /[wxy]$/ ) return 0;
  if ( cons(str,length(str)-2) && !cons(str,length(str)-1) &&  cons(str,length(str)) ) return 1;
  return 0
}

# m() measures the number of consonant sequences between k0 and j. if c is
# a consonant sequence and v a vowel sequence, and <..> indicates optional
#
#      <c><v>       gives 0
#      <c>vc<v>     gives 1
#      <c>vcvc<v>   gives 2
#      <c>vcvcvc<v> gives 3
#      ....
# this version returns "2" as the maximum value

function m(str) {
  # skip initial consonants
  mreturns=0;
  mindex=1;
  while((mindex <= length-str) && cons(str,mindex)) mindex++ ;
  while (1)
    { while(1)
        { if (mindex > length(str)) return mreturns;
          if( cons(str,mindex) ) break;
          mindex++
        }
      mindex++;
      mreturns++;
      if(mreturns > 2) return mreturns;
      while(1)
        { if (mindex > length(str)) return mreturns;
          if( ! cons(str,mindex) ) break;
          mindex++
        }
    }
}

# step1ab() gets rid of plurals and -ed or -ing. e.g.
#
#      caresses  ->  caress
#      ponies    ->  poni
#      ties      ->  ti
#      caress    ->  caress
#      cats      ->  cat
#
#      feed      ->  feed
#      agreed    ->  agree
#      disabled  ->  disable
#
#      matting   ->  mat
#      mating    ->  mate
#      meeting   ->  meet
#      milling   ->  mill
#      messing   ->  mess
#
#      meetings  ->  meet
function step1ab(str) {

if(str ~ /sses$/ || str ~ /ies$/ ) str=substr(str,1,length(str)-2) ;
else if(str ~ /ss$/) ;
else if (str ~ /s$/) str=substr(str,1,length(str)-1) ;

if(str ~ /eed$/) { if(m(substr(str,1,length(str)-3))>0 ) str=substr(str,1,length(str)-1) ; }
else {trunc=0;
    if (str ~ /[aeiouy].*ed$/) trunc=2;
      ## Begin temporary modification by AMDanischewski - 20150318  
      ##  Added the following two entries to fix est and er.  
      ##  Without this words like lightest and lighter won't be stemmed. 
      ##  So this needs to be tweaked furth =) .. 
    if (str ~ /[aeiouy].*er$/) trunc=2;   
    if (str ~ /[aeiouy].*est$/) trunc=3; 
    if (str ~ /[aeiouy].*ing$/) trunc=3 ;
    if(trunc>0) { str=substr(str,1,length(str)-trunc) ;
    if(str ~ /(at|bl|iz)$/) str=str"e" ;
    else
     if (doublec(str)==1 && (str !~ /[lsz]$/)) {  str=substr(str,1,length(str)-1); }
     else if( m(str)==1 && cvc(str)) str=str"e";
    }}
    return str }

# step1c() turns terminal y to i when there is another vowel in the stem.

function step1c(str) {
if(str ~/[aeiouy].*y$/) str=substr(str,1,length(str)-1)"i" ;
 return str }

# step2() maps double suffices to single ones. so -ization ( = -ize plus
#   -ation) maps to -ize etc. note that the string before the suffix must give
#           m() > 0. */

function step2(str) {
  if( str ~ /[aeiouy][^aeiouy].*ational$/ ) str=substr(str,1,length(str)-5)"e" ;
  else if ( str ~ /[aeiouy][^aeiou].*tional$/ ) str=substr(str,1,length(str)-2) ;
  else if ( str ~ /[aeiouy][^aeiou].*[ae]nci$/ ) str=substr(str,1,length(str)-1)"e" ;
  else if ( str ~ /[aeiouy][^aeiou].*izer$/ ) str=substr(str,1,length(str)-1) ;
  else if ( str ~ /[aeiouy][^aeiou].*bli$/ ) str=substr(str,1,length(str)-1)"e" ;
  else if ( str ~ /[aeiouy][^aeiou].*alli$/ ) str=substr(str,1,length(str)-2);
  else if ( str ~ /[aeiouy][^aeiou].*entli$/ ) str=substr(str,1,length(str)-2);
  else if ( str ~ /[aeiouy][^aeiou].*eli$/ ) str=substr(str,1,length(str)-2);
  else if ( str ~ /[aeiouy][^aeiou].*ousli$/ ) str=substr(str,1,length(str)-2);
  else if ( str ~ /[aeiouy][^aeiou].*ization$/ ) str=substr(str,1,length(str)-5)"e" ;
  else if ( str ~ /[aeiouy][^aeiou].*ation$/ ) str=substr(str,1,length(str)-3)"e" ;
  else if ( str ~ /[aeiouy][^aeiou].*ator$/ ) str=substr(str,1,length(str)-2)"e" ;
  else if ( str ~ /[aeiouy][^aeiou].*alism$/ ) str=substr(str,1,length(str)-3);
  else if ( str ~ /[aeiouy][^aeiou].*iveness$/ ) str=substr(str,1,length(str)-4) ;
  else if ( str ~ /[aeiouy][^aeiou].*fulness$/ ) str=substr(str,1,length(str)-4) ;
  else if ( str ~ /[aeiouy][^aeiou].*ousness$/ ) str=substr(str,1,length(str)-4) ;
  else if ( str ~ /[aeiouy][^aeiou].*aliti$/ ) str=substr(str,1,length(str)-3) ;
  else if ( str ~ /[aeiouy][^aeiou].*iviti$/ ) str=substr(str,1,length(str)-3)"e" ;
  else if ( str ~ /[aeiouy][^aeiou].*biliti$/ ) str=substr(str,1,length(str)-5)"le" ;
  else if ( str ~ /[aeiouy][^aeiou].*logi$/ ) str=substr(str,1,length(str)-1) ;
 return str }

# step3() deals with -ic-, -full, -ness etc. similar strategy to step2.
function step3(str) {
  if( str ~ /[aeiouy][^aeiouy].*icate$/ ) str=substr(str,1,length(str)-3) ;
  else if ( str ~ /[aeiouy][^aeiou].*ative$/ ) str=substr(str,1,length(str)-5);
  else if ( str ~ /[aeiouy][^aeiou].*alize$/ ) str=substr(str,1,length(str)-3) ;
  else if ( str ~ /[aeiouy][^aeiou].*iciti$/ ) str=substr(str,1,length(str)-3) ;
  else if ( str ~ /[aeiouy][^aeiou].*ical$/ ) str=substr(str,1,length(str)-2) ;
  else if ( str ~ /[aeiouy][^aeiou].*ful$/ ) str=substr(str,1,length(str)-3);
  else if ( str ~ /[aeiouy][^aeiou].*ness$/ ) str=substr(str,1,length(str)-4);
return str }

# step4() takes off -ant, -ence etc., in context <c>vcvc<v>.
function step4(str) {
  if( str ~ /al$/ ) { if ( m(substr(str,1,length(str)-2)) > 1 )  str=substr(str,1,length(str)-2) }
  else if ( str ~ /[ae]nce$/ ) { if ( m(substr(str,1,length(str)-4)) > 1) str=substr(str,1,length(str)-4) }
  else if ( str ~ /(er|ic)$/ ) { if (  m(substr(str,1,length(str)-2)) > 1 ) str=substr(str,1,length(str)-2) }
  else if ( str ~ /[ai]ble$/ ) { if (  m(substr(str,1,length(str)-4)) > 1 ) str=substr(str,1,length(str)-4) }
  else if ( str ~ /ant$/ ) { if (  m(substr(str,1,length(str)-3)) > 1 )  str=substr(str,1,length(str)-3) }
  else if ( str ~ /ement$/) { if(  m(substr(str,1,length(str)-5)) > 1 ) str=substr(str,1,length(str)-5) }
  else if ( str ~ /ment$/) { if (  m(substr(str,1,length(str)-4)) > 1 ) str=substr(str,1,length(str)-4) }
  else if ( str ~ /ent$/) { if (  m(substr(str,1,length(str)-3)) > 1 ) str=substr(str,1,length(str)-3) }
  else if ( str ~ /[st]ion$/) { if (  m(substr(str,1,length(str)-3)) > 1 )  str=substr(str,1,length(str)-3) }
  else if ( str ~ /ou$/) { if (  m(substr(str,1,length(str)-2)) > 1 )  str=substr(str,1,length(str)-2) }
  else if ( str ~ /(ism|ate|iti|ous|ive|ize)$/) { if (  m(substr(str,1,length(str)-3)) > 1 )  str=substr(str,1,length(str)-3)
}
return str}

# step5() removes a final -e if m() > 1, and changes -ll to -l if
#  m() > 1.
function step5(str) {
  if ( str ~ /e$/ && ( m(str)>1 || (m(str)==1  && !cvc(substr(str,1,length(str)-1)))))  str=substr(str,1,length(str)-1) ;
  if( str ~ /ll$/ && m(str)>1 ) str=substr(str,1,length(str)-1) ;
 return str }

function stem(str)
{ str=tolower(str);
  if(length(str)<=2) return str;
  str=step1ab(str);
  str=step1c(str);
  str=step2(str);
  str=step3(str);
  str=step4(str);
  str=step5(str);
return str
}

# main
{  printf("%s",stem($1));
   for(i=2;i<=NF;i++) printf("%s%s",FS,stem($i));
   print ""
}
If you save the previous to a file in your path you can add an alias like this:
alias stem='awk -f /path/to/stem.awk <<< '

Now you should be able to stem any word like this: 
~$ stem shopping
shop

No comments:

Post a Comment