/* The Porter Stemming Algorithm - Brian D Steel 02 Apr 20 / 03 Apr 20 =================================================================== This BDS-PROLOG code is based largely on a Python implementation of the Porter Stemming Algorithm, as shown on this web page: iq.opengenus.org/porter-stemmer/ However, the Python code was found to contain a number of bugs, and appropriate fixes were applied as suggested on this web page: tartarus.org/martin/PorterStemmer/ The present implementation has been tested on WIN-PROLOG 7.1, using the standard test vocabulary in the file "voc.txt", creating output that is identical to "output.txt", both files which were downloaded from the above website. To run the stemmer on a block of text, call porter/2: ?- porter( `Wondering at the Wonders of a Wonderful World`, X ). X = [`wonder`,`at`,`the`,`wonder`,`of`,`a`,`wonder`,`world`] To run the stemmer on a single word, call stem/2: ?- stem( `complication`, X ). X = `complic` */ % perform porter stemming on a sentence, returning a list of stems porter( String, List ) :- sanitise( String, Stretch ), tokenise( Stretch, Tokens ), findall( Stem, ( member( Word, Tokens ), stem( Word, Stem ) ), List ). % perform porter stemming on a single word stem( Word, Reps ) :- ( len( Word, Length ), Length > 2 -> lwrupr( Lower, Word ), step_1a( Lower, Part1a ), step_1b( Part1a, Part1b ), step_1c( Part1b, Part1c ), step_2( Part1c, Part2 ), step_3( Part2, Part3 ), step_4( Part3, Part4 ), step_5a( Part4, Part5a ), step_5b( Part5a, Part5b ), Reps = Part5b ; Reps = Word ). % algorithm step 1a step_1a( Word, Reps ) :- ( ends_with( Word, _, `sses` ) -> replace( Word, `sses`, `ss`, Reps ) ; ends_with( Word, _, `ies` ) -> replace( Word, `ies`, `i`, Reps ) ; ends_with( Word, _, `ss` ) -> replace( Word, `ss`, `ss`, Reps ) ; ends_with( Word, _, `s` ) -> replace( Word, `s`, ``, Reps ) ; Reps = Word ). % algorithm step 1b step_1b( Word, Reps ) :- ( ends_with( Word, Base, `eed` ) -> ( get_m( Base, Mval ), Mval > 0 -> cat( [Base,`ee`], Part, _ ) ; Part = Word ), Flag = fail ; ends_with( Word, Base, `ed` ) -> ( contains_vowel( Base ) -> Part = Base, Flag = true ; Part = Word, Flag = fail ) ; ends_with( Word, Base, `ing` ) -> ( contains_vowel( Base ) -> Part = Base, Flag = true ; Part = Word, Flag = fail ) ; Part = Word, Flag = fail ), ( Flag -> ( ( ends_with( Part, _, `at` ) ; ends_with( Part, _, `bl` ) ; ends_with( Part, _, `iz` ) ) -> cat( [Part,`e`], More, _ ) ; double_cons( Part ), \+ ends_with( Part, _, `l` ), \+ ends_with( Part, _, `s` ), \+ ends_with( Part, _, `z` ) -> len( Part, Length ), Less is Length - 1, cat( List, Part, [Less] ), List = [More,_] ; get_m( Part, 1 ), cvc( Part ) -> cat( [Part,`e`], More, _ ) ; More = Part ) ; More = Part ), Reps = More. % algorithm step 1c step_1c( Word, Reps ) :- ( ends_with( Word, Base, `y` ), contains_vowel( Base ) -> cat( [Base,`i`], Part, _ ) ; Part = Word ), Reps = Part. % algorithm step 2 step_2( Word, Reps ) :- ( ends_with( Word, _, `logi` ) -> replace_m0( Word, `logi`, `log`, Part ) ; ends_with( Word, _, `ational` ) -> replace_m0( Word, `ational`, `ate`, Part ) ; ends_with( Word, _, `tional` ) -> replace_m0( Word, `tional`, `tion`, Part ) ; ends_with( Word, _, `enci` ) -> replace_m0( Word, `enci`, `ence`, Part ) ; ends_with( Word, _, `anci` ) -> replace_m0( Word, `anci`, `ance`, Part ) ; ends_with( Word, _, `izer` ) -> replace_m0( Word, `izer`, `ize`, Part ) ; ends_with( Word, _, `bli` ) -> replace_m0( Word, `bli`, `ble`, Part ) ; ends_with( Word, _, `alli` ) -> replace_m0( Word, `alli`, `al`, Part ) ; ends_with( Word, _, `entli` ) -> replace_m0( Word, `entli`, `ent`, Part ) ; ends_with( Word, _, `eli` ) -> replace_m0( Word, `eli`, `e`, Part ) ; ends_with( Word, _, `ousli` ) -> replace_m0( Word, `ousli`, `ous`, Part ) ; ends_with( Word, _, `ization` ) -> replace_m0( Word, `ization`, `ize`, Part ) ; ends_with( Word, _, `ation` ) -> replace_m0( Word, `ation`, `ate`, Part ) ; ends_with( Word, _, `ator` ) -> replace_m0( Word, `ator`, `ate`, Part ) ; ends_with( Word, _, `alism` ) -> replace_m0( Word, `alism`, `al`, Part ) ; ends_with( Word, _, `iveness` ) -> replace_m0( Word, `iveness`, `ive`, Part ) ; ends_with( Word, _, `fulness` ) -> replace_m0( Word, `fulness`, `ful`, Part ) ; ends_with( Word, _, `ousness` ) -> replace_m0( Word, `ousness`, `ous`, Part ) ; ends_with( Word, _, `aliti` ) -> replace_m0( Word, `aliti`, `al`, Part ) ; ends_with( Word, _, `iviti` ) -> replace_m0( Word, `iviti`, `ive`, Part ) ; ends_with( Word, _, `biliti` ) -> replace_m0( Word, `biliti`, `ble`, Part ) ; Part = Word ), Reps = Part. % algorithm step 3 step_3( Word, Reps ) :- ( ends_with( Word, _, `icate` ) -> replace_m0( Word, `icate`, `ic`, Part ) ; ends_with( Word, _, `ative` ) -> replace_m0( Word, `ative`, ``, Part ) ; ends_with( Word, _, `alize` ) -> replace_m0( Word, `alize`, `al`, Part ) ; ends_with( Word, _, `iciti` ) -> replace_m0( Word, `iciti`, `ic`, Part ) ; ends_with( Word, _, `ical` ) -> replace_m0( Word, `ical`, `ic`, Part ) ; ends_with( Word, _, `ful` ) -> replace_m0( Word, `ful`, ``, Part ) ; ends_with( Word, _, `ness` ) -> replace_m0( Word, `ness`, ``, Part ) ; Part = Word ), Reps = Part. % algorithm step 4 step_4( Word, Reps ) :- ( ends_with( Word, _, `al` ) -> replace_m1( Word, `al`, ``, Part ) ; ends_with( Word, _, `ance` ) -> replace_m1( Word, `ance`, ``, Part ) ; ends_with( Word, _, `ence` ) -> replace_m1( Word, `ence`, ``, Part ) ; ends_with( Word, _, `er` ) -> replace_m1( Word, `er`, ``, Part ) ; ends_with( Word, _, `ic` ) -> replace_m1( Word, `ic`, ``, Part ) ; ends_with( Word, _, `able` ) -> replace_m1( Word, `able`, ``, Part ) ; ends_with( Word, _, `ible` ) -> replace_m1( Word, `ible`, ``, Part ) ; ends_with( Word, _, `ant` ) -> replace_m1( Word, `ant`, ``, Part ) ; ends_with( Word, _, `ement` ) -> replace_m1( Word, `ement`, ``, Part ) ; ends_with( Word, _, `ment` ) -> replace_m1( Word, `ment`, ``, Part ) ; ends_with( Word, _, `ent` ) -> replace_m1( Word, `ent`, ``, Part ) ; ends_with( Word, _, `ou` ) -> replace_m1( Word, `ou`, ``, Part ) ; ends_with( Word, _, `ism` ) -> replace_m1( Word, `ism`, ``, Part ) ; ends_with( Word, _, `ate` ) -> replace_m1( Word, `ate`, ``, Part ) ; ends_with( Word, _, `iti` ) -> replace_m1( Word, `iti`, ``, Part ) ; ends_with( Word, _, `ous` ) -> replace_m1( Word, `ous`, ``, Part ) ; ends_with( Word, _, `ive` ) -> replace_m1( Word, `ive`, ``, Part ) ; ends_with( Word, _, `ize` ) -> replace_m1( Word, `ize`, ``, Part ) ; ends_with( Word, Base, `ion` ) -> ( get_m( Base, Mval ), Mval > 1, ( ends_with( Base, _, `s` ) ; ends_with( Base, _, `t` ) ) -> Part = Base ; Part = Word ) ; Part = Word ), Reps = Part. % algorithm step 5a step_5a( Word, Reps ) :- ( ends_with( Word, Base, `e` ) -> ( get_m( Base, Mval ), Mval > 1 -> Part = Base ; get_m( Base, 1 ), \+ cvc( Base ) -> Part = Base ; Part = Word ) ; Part = Word ), Reps = Part. % algorithm step 5b step_5b( Word, Reps ) :- ( get_m( Word, Mval ), Mval > 1, ends_with( Word, _, `ll` ) -> replace( Word, `ll`, `l`, Part ) ; Part = Word ), Reps = Part. % single letter is a consonant is_cons( Letter ) :- ( member( Letter, [`a`,`e`,`i`,`o`,`u`] ) -> fail ; true ). % decide if "y" is a consonant is_consonant( Word, Index ) :- extract( Word, Index, Letter ), ( is_cons( Letter ) -> ( Letter = `y`, extract( Word, Index-1, Previous ), is_cons( Previous ) -> fail ; true ) ; fail ). % it's a vowel if it's not a consonant is_vowel( Word, Index ) :- \+ is_consonant( Word, Index ). % test word ending, returning base if a match ends_with( Stem, Base, Ends ) :- len( Stem, Length ), len( Ends, Short ), Length > Short, Split is Length - Short, cat( List, Stem, [Split] ), List = [Base,Ends]. % see if a stem contains a vowel contains_vowel( Stem ) :- len( Stem, Length ), ( repeat( Length, Index ), extract( Stem, Index, Letter ), ( \+ is_cons( Letter ) -> true ; Letter = `y` -> true ; fail ) ; fail ). % see if a stem ends in a double consonant double_cons( Stem ) :- len( Stem, Length ), ( Length > 2 -> ( extract( Stem, Length, Letter ), extract( Stem, Length-1, Letter ), is_cons( Letter ) -> true ; fail ) ; fail ). % get the form of a word get_form( Word, Form ) :- Prev = ``, len( Word, Length ), ( repeat( Length, Index ), ( is_consonant( Word, Index ) -> ( Prev \= `C` -> swrite( `C` ), asn( [Prev], `C` ), fail ) ; ( Prev \= `V` -> swrite( `V` ), asn( [Prev], `V` ), fail ) ) ; true ) ~> Text, Form = Text. % get the measure of a word get_m( Word, Mval ) :- get_form( Word, Form ), Temp = 0, ( repeat, find( `VC`, 0, Find ), ( Find = `VC` -> More is Temp + 1, asn( [Temp], More ), fail ; true ) -> Mval = Temp ) <~ Form. % see of a word ends in consonant, vowel, consonant cvc( Word ) :- len( Word, Length ), ( Length >= 3 -> ( is_consonant( Word, Length ), is_vowel( Word, Length-1 ), is_consonant( Word, Length-2 ), extract( Word, Length, Letter ), \+ member( Letter, [`w`,`x`,`y`] ) -> true ; fail ) ; fail ). % replace the ending of a word replace( Orig, Rem, Rep, Reps ) :- len( Orig, Length ), len( Rem, Short ), Length > Short, Split is Length - Short, cat( List, Orig, [Split] ), ( List = [Base,Rem] -> cat( [Base,Rep], Text, _ ) ; Text = Rep ), Reps = Text. % replace the ending of a word where m > 0 replace_m0( Orig, Rem, Rep, Reps ) :- len( Orig, Length ), len( Rem, Short ), Length > Short, Split is Length - Short, cat( List, Orig, [Split] ), ( List = [Base,Rem], get_m( Base, Mval ), Mval > 0 -> cat( [Base,Rep], Text, _ ) ; Text = Orig ), Reps = Text. % replace the ending of a word where m > 1 replace_m1( Orig, Rem, Rep, Reps ) :- len( Orig, Length ), len( Rem, Short ), Length > Short, Split is Length - Short, cat( List, Orig, [Split] ), ( List = [Base,Rem], get_m( Base, Mval ), Mval > 1 -> cat( [Base,Rep], Text, _ ) ; Text = Orig ), Reps = Text. % extract a given letter from a word extract( Word, Index, Letter ) :- Split is Index - 1, cat( List, Word, [Split,1] ), List = [_,Letter,_]. % sanitise a sentence isolating any non-alphabetic characters sanitise( String, Stretch ) :- len( String, Length ), ( repeat( Length, Index ), extract( String, Index, Letter ), ( ( cmp( -1, Letter, `A` ) ; cmp( 1, Letter, `Z` ), cmp( -1, Letter, `a` ) ; cmp( 1, Letter, `z` ) ) -> swrite( ` ` ), swrite( Letter ), swrite( ` ` ) ; swrite( Letter ) ), Index = Length -> true ) ~> Text, Stretch = Text. % tokenise a sentence into a list (taken from strings.pl) tokenise( String, Tokens ) :- findall( Token, ( repeat, find( ``, 0, Find ), ( Find = `` -> !, fail ; scan( [], 2, _ ) ~> Token ) ) <~ String, Tokens ).