/*
   The Porter Stemming Algorithm - Brian D Steel 02 Apr 20 / 03 Apr 20
   ===================================================================

   This BDS-PROLOG code is based largely on a Python implementation of
   the Porter Stemming Algorithm, as shown on this web page:

      iq.opengenus.org/porter-stemmer/

   However, the Python code was found to contain a number of bugs, and
   appropriate fixes were applied as suggested on this web page:

      tartarus.org/martin/PorterStemmer/

   The present implementation has been tested on WIN-PROLOG 7.1, using
   the standard test vocabulary in the file "voc.txt", creating output
   that is identical to "output.txt", both files which were downloaded
   from the above website.

   To run the stemmer on a block of text, call porter/2:

      ?- porter( `Wondering at the Wonders of a Wonderful World`, X ).
      X = [`wonder`,`at`,`the`,`wonder`,`of`,`a`,`wonder`,`world`]

   To run the stemmer on a single word, call stem/2:

      ?- stem( `complication`, X ).
      X = `complic`
*/

% perform porter stemming on a sentence, returning a list of stems

porter( String, List ) :-
   sanitise( String, Stretch ),
   tokenise( Stretch, Tokens ),
   findall( Stem,
            (  member( Word, Tokens ),
               stem( Word, Stem )
            ),
            List
          ).

% perform porter stemming on a single word

stem( Word, Reps ) :-
   (  len( Word, Length ),
      Length > 2
   -> lwrupr( Lower, Word ),
      step_1a( Lower, Part1a ),
      step_1b( Part1a, Part1b ),
      step_1c( Part1b, Part1c ),
      step_2( Part1c, Part2 ),
      step_3( Part2, Part3 ),
      step_4( Part3, Part4 ),
      step_5a( Part4, Part5a ),
      step_5b( Part5a, Part5b ),
      Reps = Part5b
   ;  Reps = Word
   ).

% algorithm step 1a

step_1a( Word, Reps ) :-
   (  ends_with( Word, _, `sses` )
   -> replace( Word, `sses`, `ss`, Reps )
   ;  ends_with( Word, _, `ies` )
   -> replace( Word, `ies`, `i`, Reps )
   ;  ends_with( Word, _, `ss` )
   -> replace( Word, `ss`, `ss`, Reps )
   ;  ends_with( Word, _, `s` )
   -> replace( Word, `s`, ``, Reps )
   ;  Reps = Word
   ).

% algorithm step 1b

step_1b( Word, Reps ) :-
   (  ends_with( Word, Base, `eed` )
   -> (  get_m( Base, Mval ),
         Mval > 0
      -> cat( [Base,`ee`], Part, _ )
      ;  Part = Word
      ),
      Flag = fail
   ;  ends_with( Word, Base, `ed` )
   -> (  contains_vowel( Base )
      -> Part = Base,
         Flag = true
      ;  Part = Word,
         Flag = fail
      )
   ;  ends_with( Word, Base, `ing` )
   -> (  contains_vowel( Base )
      -> Part = Base,
         Flag = true
      ;  Part = Word,
         Flag = fail
      )
   ;  Part = Word,
      Flag = fail
   ),
   (  Flag
   -> (  (  ends_with( Part, _, `at` )
         ;  ends_with( Part, _, `bl` )
         ;  ends_with( Part, _, `iz` )
         )
      -> cat( [Part,`e`], More, _ )
      ;  double_cons( Part ),
         \+ ends_with( Part, _, `l` ),
         \+ ends_with( Part, _, `s` ),
         \+ ends_with( Part, _, `z` )
      -> len( Part, Length ),
         Less is Length - 1,
         cat( List, Part, [Less] ),
         List = [More,_]
      ;  get_m( Part, 1 ),
         cvc( Part )
      -> cat( [Part,`e`], More, _ )
      ;  More = Part
      )
   ;  More = Part
   ),
   Reps = More.

% algorithm step 1c

step_1c( Word, Reps ) :-
   (  ends_with( Word, Base, `y` ),
      contains_vowel( Base )
   -> cat( [Base,`i`], Part, _ )
   ;  Part = Word
   ),
   Reps = Part.

% algorithm step 2

step_2( Word, Reps ) :-
   (  ends_with( Word, _, `logi` )
   -> replace_m0( Word, `logi`, `log`, Part )
   ;  ends_with( Word, _, `ational` )
   -> replace_m0( Word, `ational`, `ate`, Part )
   ;  ends_with( Word, _, `tional` )
   -> replace_m0( Word, `tional`, `tion`, Part )
   ;  ends_with( Word, _, `enci` )
   -> replace_m0( Word, `enci`, `ence`, Part )
   ;  ends_with( Word, _, `anci` )
   -> replace_m0( Word, `anci`, `ance`, Part )
   ;  ends_with( Word, _, `izer` )
   -> replace_m0( Word, `izer`, `ize`, Part )
   ;  ends_with( Word, _, `bli` )
   -> replace_m0( Word, `bli`, `ble`, Part )
   ;  ends_with( Word, _, `alli` )
   -> replace_m0( Word, `alli`, `al`, Part )
   ;  ends_with( Word, _, `entli` )
   -> replace_m0( Word, `entli`, `ent`, Part )
   ;  ends_with( Word, _, `eli` )
   -> replace_m0( Word, `eli`, `e`, Part )
   ;  ends_with( Word, _, `ousli` )
   -> replace_m0( Word, `ousli`, `ous`, Part )
   ;  ends_with( Word, _, `ization` )
   -> replace_m0( Word, `ization`, `ize`, Part )
   ;  ends_with( Word, _, `ation` )
   -> replace_m0( Word, `ation`, `ate`, Part )
   ;  ends_with( Word, _, `ator` )
   -> replace_m0( Word, `ator`, `ate`, Part )
   ;  ends_with( Word, _, `alism` )
   -> replace_m0( Word, `alism`, `al`, Part )
   ;  ends_with( Word, _, `iveness` )
   -> replace_m0( Word, `iveness`, `ive`, Part )
   ;  ends_with( Word, _, `fulness` )
   -> replace_m0( Word, `fulness`, `ful`, Part )
   ;  ends_with( Word, _, `ousness` )
   -> replace_m0( Word, `ousness`, `ous`, Part )
   ;  ends_with( Word, _, `aliti` )
   -> replace_m0( Word, `aliti`, `al`, Part )
   ;  ends_with( Word, _, `iviti` )
   -> replace_m0( Word, `iviti`, `ive`, Part )
   ;  ends_with( Word, _, `biliti` )
   -> replace_m0( Word, `biliti`, `ble`, Part )
   ;  Part = Word
   ),
   Reps = Part.

% algorithm step 3

step_3( Word, Reps ) :-
   (  ends_with( Word, _, `icate` )
   -> replace_m0( Word, `icate`, `ic`, Part )
   ;  ends_with( Word, _, `ative` )
   -> replace_m0( Word, `ative`, ``, Part )
   ;  ends_with( Word, _, `alize` )
   -> replace_m0( Word, `alize`, `al`, Part )
   ;  ends_with( Word, _, `iciti` )
   -> replace_m0( Word, `iciti`, `ic`, Part )
   ;  ends_with( Word, _, `ical` )
   -> replace_m0( Word, `ical`, `ic`, Part )
   ;  ends_with( Word, _, `ful` )
   -> replace_m0( Word, `ful`, ``, Part )
   ;  ends_with( Word, _, `ness` )
   -> replace_m0( Word, `ness`, ``, Part )
   ;  Part = Word
   ),
   Reps = Part.

% algorithm step 4

step_4( Word, Reps ) :-
   (  ends_with( Word, _, `al` )
   -> replace_m1( Word, `al`, ``, Part )
   ;  ends_with( Word, _, `ance` )
   -> replace_m1( Word, `ance`, ``, Part )
   ;  ends_with( Word, _, `ence` )
   -> replace_m1( Word, `ence`, ``, Part )
   ;  ends_with( Word, _, `er` )
   -> replace_m1( Word, `er`, ``, Part )
   ;  ends_with( Word, _, `ic` )
   -> replace_m1( Word, `ic`, ``, Part )
   ;  ends_with( Word, _, `able` )
   -> replace_m1( Word, `able`, ``, Part )
   ;  ends_with( Word, _, `ible` )
   -> replace_m1( Word, `ible`, ``, Part )
   ;  ends_with( Word, _, `ant` )
   -> replace_m1( Word, `ant`, ``, Part )
   ;  ends_with( Word, _, `ement` )
   -> replace_m1( Word, `ement`, ``, Part )
   ;  ends_with( Word, _, `ment` )
   -> replace_m1( Word, `ment`, ``, Part )
   ;  ends_with( Word, _, `ent` )
   -> replace_m1( Word, `ent`, ``, Part )
   ;  ends_with( Word, _, `ou` )
   -> replace_m1( Word, `ou`, ``, Part )
   ;  ends_with( Word, _, `ism` )
   -> replace_m1( Word, `ism`, ``, Part )
   ;  ends_with( Word, _, `ate` )
   -> replace_m1( Word, `ate`, ``, Part )
   ;  ends_with( Word, _, `iti` )
   -> replace_m1( Word, `iti`, ``, Part )
   ;  ends_with( Word, _, `ous` )
   -> replace_m1( Word, `ous`, ``, Part )
   ;  ends_with( Word, _, `ive` )
   -> replace_m1( Word, `ive`, ``, Part )
   ;  ends_with( Word, _, `ize` )
   -> replace_m1( Word, `ize`, ``, Part )
   ;  ends_with( Word, Base, `ion` )
   -> (  get_m( Base, Mval ),
         Mval > 1,
         (  ends_with( Base, _, `s` )
         ;  ends_with( Base, _, `t` )
         )
      -> Part = Base
      ;  Part = Word
      )
   ;  Part = Word
   ),
   Reps = Part.

% algorithm step 5a

step_5a( Word, Reps ) :-
   (  ends_with( Word, Base, `e` )
   -> (  get_m( Base, Mval ),
         Mval > 1
      -> Part = Base
      ;  get_m( Base, 1 ),
         \+ cvc( Base )
      -> Part = Base
      ;  Part = Word
      )
   ;  Part = Word
   ),
   Reps = Part.

% algorithm step 5b

step_5b( Word, Reps ) :-
   (  get_m( Word, Mval ),
      Mval > 1,
      ends_with( Word, _, `ll` )
   -> replace( Word, `ll`, `l`, Part )
   ;  Part = Word
   ),
   Reps = Part.

% single letter is a consonant

is_cons( Letter ) :-
   (  member( Letter, [`a`,`e`,`i`,`o`,`u`] )
   -> fail
   ;  true
   ).

% decide if "y" is a consonant

is_consonant( Word, Index ) :-
   extract( Word, Index, Letter ),
   (  is_cons( Letter )
   -> (  Letter = `y`,
         extract( Word, Index-1, Previous ),
         is_cons( Previous )
      -> fail
      ;  true
      )
   ;  fail
   ).

% it's a vowel if it's not a consonant

is_vowel( Word, Index ) :-
   \+ is_consonant( Word, Index ).

% test word ending, returning base if a match

ends_with( Stem, Base, Ends ) :-
   len( Stem, Length ),
   len( Ends, Short ),
   Length > Short,
   Split is Length - Short,
   cat( List, Stem, [Split] ),
   List = [Base,Ends].

% see if a stem contains a vowel

contains_vowel( Stem ) :-
   len( Stem, Length ),
   (  repeat( Length, Index ),
      extract( Stem, Index, Letter ),
      (  \+ is_cons( Letter )
      -> true
      ;  Letter = `y`
      -> true
      ;  fail
      )
   ;  fail
   ).

% see if a stem ends in a double consonant

double_cons( Stem ) :-
   len( Stem, Length ),
   (  Length > 2
   -> (  extract( Stem, Length, Letter ),
         extract( Stem, Length-1, Letter ),
         is_cons( Letter )
      -> true
      ;  fail
      )
   ;  fail
   ).

% get the form of a word

get_form( Word, Form ) :-
   Prev = ``,
   len( Word, Length ),
   (  repeat( Length, Index ),
      (  is_consonant( Word, Index )
      -> (  Prev \= `C`
         -> swrite( `C` ),
            asn( [Prev], `C` ),
            fail
         )
      ;  (  Prev \= `V`
         -> swrite( `V` ),
            asn( [Prev], `V` ),
            fail
         )
      )
   ;  true
   ) ~> Text,
   Form = Text.

% get the measure of a word

get_m( Word, Mval ) :-
   get_form( Word, Form ),
   Temp = 0,
   (  repeat,
      find( `VC`, 0, Find ),
      (  Find = `VC`
      -> More is Temp + 1,
         asn( [Temp], More ),
         fail
      ;  true
      )
   -> Mval = Temp
   ) <~ Form.

% see of a word ends in consonant, vowel, consonant

cvc( Word ) :-
   len( Word, Length ),
   (  Length >= 3
   -> (  is_consonant( Word, Length ),
         is_vowel( Word, Length-1 ),
         is_consonant( Word, Length-2 ),
         extract( Word, Length, Letter ),
         \+ member( Letter, [`w`,`x`,`y`] )
      -> true
      ;  fail
      )
   ;  fail
   ).

% replace the ending of a word

replace( Orig, Rem, Rep, Reps ) :-
   len( Orig, Length ),
   len( Rem, Short ),
   Length > Short,
   Split is Length - Short,
   cat( List, Orig, [Split] ),
   (  List = [Base,Rem]
   -> cat( [Base,Rep], Text, _ )
   ;  Text = Rep
   ),
   Reps = Text.

% replace the ending of a word where m > 0

replace_m0( Orig, Rem, Rep, Reps ) :-
   len( Orig, Length ),
   len( Rem, Short ),
   Length > Short,
   Split is Length - Short,
   cat( List, Orig, [Split] ),
   (  List = [Base,Rem],
      get_m( Base, Mval ),
      Mval > 0
   -> cat( [Base,Rep], Text, _ )
   ;  Text = Orig
   ),
   Reps = Text.

% replace the ending of a word where m > 1

replace_m1( Orig, Rem, Rep, Reps ) :-
   len( Orig, Length ),
   len( Rem, Short ),
   Length > Short,
   Split is Length - Short,
   cat( List, Orig, [Split] ),
   (  List = [Base,Rem],
      get_m( Base, Mval ),
      Mval > 1
   -> cat( [Base,Rep], Text, _ )
   ;  Text = Orig
   ),
   Reps = Text.

% extract a given letter from a word

extract( Word, Index, Letter ) :-
   Split is Index - 1,
   cat( List, Word, [Split,1] ),
   List = [_,Letter,_].

% sanitise a sentence isolating any non-alphabetic characters

sanitise( String, Stretch ) :-
   len( String, Length ),
   (  repeat( Length, Index ),
      extract( String, Index, Letter ),
      (  (  cmp( -1, Letter, `A` )
         ;  cmp( 1, Letter, `Z` ),
            cmp( -1, Letter, `a` )
         ;  cmp( 1, Letter, `z` )
         )
      -> swrite( ` ` ),
         swrite( Letter ),
         swrite( ` ` )
      ;  swrite( Letter )
      ),
      Index = Length
   -> true
   ) ~> Text,
   Stretch = Text.

% tokenise a sentence into a list (taken from strings.pl)

tokenise( String, Tokens ) :-
   findall( Token,
            (  repeat,
               find( ``, 0, Find ),
               (  Find = ``
               -> !,
                  fail
               ;  scan( [], 2, _ ) ~> Token
               )
            ) <~ String,
            Tokens
          ).