1 // Porter stemmer in JavaScript. Few comments, but it's easy to follow against the rules in the original
4 // Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
7 // see also http://www.tartarus.org/~martin/PorterStemmer
10 // Derived from (http://tartarus.org/~martin/PorterStemmer/js.txt) - cjm (iizuu) Aug 24, 2009
12 var stemmer = (function(){
47 c = "[^aeiou]", // consonant
48 v = "[aeiouy]", // vowel
49 C = c + "[^aeiouy]*", // consonant sequence
50 V = v + "[aeiou]*", // vowel sequence
52 mgr0 = "^(" + C + ")?" + V + C, // [C]VC... is m>0
53 meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$", // [C]VC[V] is m=1
54 mgr1 = "^(" + C + ")?" + V + C + V + C, // [C]VCVC... is m>1
55 s_v = "^(" + C + ")?" + v; // vowel in stem
67 if (w.length < 3) { return w; }
69 firstch = w.substr(0,1);
71 w = firstch.toUpperCase() + w.substr(1);
75 re = /^(.+?)(ss|i)es$/;
76 re2 = /^(.+?)([^s])s$/;
78 if (re.test(w)) { w = w.replace(re,"$1$2"); }
79 else if (re2.test(w)) { w = w.replace(re2,"$1$2"); }
83 re2 = /^(.+?)(ed|ing)$/;
86 re = new RegExp(mgr0);
91 } else if (re2.test(w)) {
94 re2 = new RegExp(s_v);
98 re3 = new RegExp("([^aeiouylsz])\\1$");
99 re4 = new RegExp("^" + C + v + "[^aeiouwxy]$");
100 if (re2.test(w)) { w = w + "e"; }
101 else if (re3.test(w)) { re = /.$/; w = w.replace(re,""); }
102 else if (re4.test(w)) { w = w + "e"; }
107 re = new RegExp("^(.+" + c + ")y$");
115 re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
120 re = new RegExp(mgr0);
122 w = stem + step2list[suffix];
127 re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
132 re = new RegExp(mgr0);
134 w = stem + step3list[suffix];
139 re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/;
140 re2 = /^(.+?)(s|t)(ion)$/;
144 re = new RegExp(mgr1);
148 } else if (re2.test(w)) {
149 var fp = re2.exec(w);
150 stem = fp[1] + fp[2];
151 re2 = new RegExp(mgr1);
152 if (re2.test(stem)) {
162 re = new RegExp(mgr1);
163 re2 = new RegExp(meq1);
164 re3 = new RegExp("^" + C + v + "[^aeiouwxy]$");
165 if (re.test(stem) || (re2.test(stem) && !(re3.test(stem)))) {
171 re2 = new RegExp(mgr1);
172 if (re.test(w) && re2.test(w)) {
174 w = w.replace(re,"");
177 // and turn initial Y back to y
179 if (firstch == "y") {
180 w = firstch.toLowerCase() + w.substr(1);
183 // See http://snowball.tartarus.org/algorithms/english/stemmer.html
184 // "Exceptional forms in general"
199 if(specialWords[origword]){
200 w = specialWords[origword];
203 if( "sky news howe atlas cosmos bias \
204 andes inning outing canning herring \
205 earring proceed exceed succeed".indexOf(origword) !== -1 ){
209 // Address words overstemmed as gener-
210 re = /.*generate?s?d?(ing)?$/;
211 if( re.test(origword) ){
214 re = /.*general(ly)?$/;
215 if( re.test(origword) ){
218 re = /.*generic(ally)?$/;
219 if( re.test(origword) ){
222 re = /.*generous(ly)?$/;
223 if( re.test(origword) ){
226 // Address words overstemmed as commun-
227 re = /.*communit(ies)?y?/;
228 if( re.test(origword) ){