Imported Upstream version 1.77.1
[platform/upstream/docbook-xsl-stylesheets.git] / webhelp / docs / content / search / stemmers / en_stemmer.js
1 // Porter stemmer in Javascript. Few comments, but it's easy to follow against the rules in the original
2 // paper, in
3 //
4 //  Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
5 //  no. 3, pp 130-137,
6 //
7 // see also http://www.tartarus.org/~martin/PorterStemmer
8
9 // Release 1
10 // Derived from (http://tartarus.org/~martin/PorterStemmer/js.txt) - cjm (iizuu) Aug 24, 2009
11
/**
 * Porter stemmer for English, extended with a few exceptional forms.
 *
 * The closure pre-compiles every regular expression and lookup table once;
 * the returned function then applies Porter steps 1a-5 to a single word and
 * finally patches a handful of words for which the plain Porter result is
 * not the desired stem.
 *
 * @param {string} w  The word to stem. Every pattern below matches lowercase
 *                    letters only, so the caller is expected to lowercase.
 * @returns {string}  The stem. Words shorter than three characters are
 *                    returned unchanged.
 */
var stemmer = (function () {
    var hasOwn = Object.prototype.hasOwnProperty,

        // Step 2: suffix -> replacement (applied when the stem has m > 0).
        step2list = {
            "ational" : "ate",
            "tional" : "tion",
            "enci" : "ence",
            "anci" : "ance",
            "izer" : "ize",
            "bli" : "ble",
            "alli" : "al",
            "entli" : "ent",
            "eli" : "e",
            "ousli" : "ous",
            "ization" : "ize",
            "ation" : "ate",
            "ator" : "ate",
            "alism" : "al",
            "iveness" : "ive",
            "fulness" : "ful",
            "ousness" : "ous",
            "aliti" : "al",
            "iviti" : "ive",
            "biliti" : "ble",
            "logi" : "log"
        },

        // Step 3: suffix -> replacement (applied when the stem has m > 0).
        step3list = {
            "icate" : "ic",
            "ative" : "",
            "alize" : "al",
            "iciti" : "ic",
            "ical" : "ic",
            "ful" : "",
            "ness" : ""
        },

        // Whole words mapped directly to their stem.
        // See http://snowball.tartarus.org/algorithms/english/stemmer.html
        // "Exceptional forms in general"
        specialWords = {
            "skis" : "ski",
            "skies" : "sky",
            "dying" : "die",
            "lying" : "lie",
            "tying" : "tie",
            "idly" : "idl",
            "gently" : "gentl",
            "ugly" : "ugli",
            "early" : "earli",
            "only" : "onli",
            "singly" : "singl"
        },

        // Whole words that are returned exactly as given.
        invariantWords = {
            "sky" : true,
            "news" : true,
            "howe" : true,
            "atlas" : true,
            "cosmos" : true,
            "bias" : true,
            "andes" : true,
            "inning" : true,
            "outing" : true,
            "canning" : true,
            "herring" : true,
            "earring" : true,
            "proceed" : true,
            "exceed" : true,
            "succeed" : true
        },

        // Words overstemmed as gener- / commun-: when the ORIGINAL word
        // matches `pattern`, `append` is added back onto the computed stem.
        // Every pattern is anchored at the end of the word.
        overstemFixups = [
            { pattern : /.*generate?s?d?(ing)?$/, append : "at" },
            { pattern : /.*general(ly)?$/, append : "al" },
            { pattern : /.*generic(ally)?$/, append : "ic" },
            { pattern : /.*generous(ly)?$/, append : "ous" },
            { pattern : /.*communit(ies)?y?$/, append : "iti" }
        ],

        c = "[^aeiou]",          // consonant
        v = "[aeiouy]",          // vowel
        C = c + "[^aeiouy]*",    // consonant sequence
        V = v + "[aeiou]*",      // vowel sequence

        mgr0 = new RegExp("^(" + C + ")?" + V + C),                    // [C]VC... is m>0
        meq1 = new RegExp("^(" + C + ")?" + V + C + "(" + V + ")?$"),  // [C]VC[V] is m=1
        mgr1 = new RegExp("^(" + C + ")?" + V + C + V + C),            // [C]VCVC... is m>1
        s_v = new RegExp("^(" + C + ")?" + v),                         // vowel in stem
        cvc = new RegExp("^" + C + v + "[^aeiouwxy]$"),                // consonant-vowel-consonant, last not w/x/y
        doubled = new RegExp("([^aeiouylsz])\\1$"),                    // doubled consonant other than l/s/z
        lastChar = /.$/,

        step1aSses = /^(.+?)(ss|i)es$/,
        step1aS = /^(.+?)([^s])s$/,
        step1bEed = /^(.+?)eed$/,
        step1bEdIng = /^(.+?)(ed|ing)$/,
        step1bAddE = /(at|bl|iz)$/,
        step1c = new RegExp("^(.+" + c + ")y$"),
        step2 = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/,
        step3 = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/,
        step4 = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/,
        step4Ion = /^(.+?)(s|t)(ion)$/,
        step5E = /^(.+?)e$/,
        step5Ll = /ll$/;

    return function (w) {
        var stem,
            fp,
            firstch,
            i,
            origword = w;

        if (w.length < 3) { return w; }

        // An initial "y" is upper-cased so that it is not matched by the
        // vowel class `v` while the steps run; it is restored afterwards.
        firstch = w.substr(0, 1);
        if (firstch === "y") {
            w = firstch.toUpperCase() + w.substr(1);
        }

        // Step 1a
        if (step1aSses.test(w)) {
            w = w.replace(step1aSses, "$1$2");
        } else if (step1aS.test(w)) {
            w = w.replace(step1aS, "$1$2");
        }

        // Step 1b
        fp = step1bEed.exec(w);
        if (fp) {
            if (mgr0.test(fp[1])) {
                w = w.replace(lastChar, "");
            }
        } else {
            fp = step1bEdIng.exec(w);
            if (fp && s_v.test(fp[1])) {
                w = fp[1];
                if (step1bAddE.test(w)) {
                    w = w + "e";
                } else if (doubled.test(w)) {
                    w = w.replace(lastChar, "");
                } else if (cvc.test(w)) {
                    w = w + "e";
                }
            }
        }

        // Step 1c
        fp = step1c.exec(w);
        if (fp) {
            w = fp[1] + "i";
        }

        // Step 2
        fp = step2.exec(w);
        if (fp && mgr0.test(fp[1])) {
            w = fp[1] + step2list[fp[2]];
        }

        // Step 3
        fp = step3.exec(w);
        if (fp && mgr0.test(fp[1])) {
            w = fp[1] + step3list[fp[2]];
        }

        // Step 4 (the -sion/-tion rule is only tried when no other
        // step 4 suffix matched at all)
        fp = step4.exec(w);
        if (fp) {
            if (mgr1.test(fp[1])) {
                w = fp[1];
            }
        } else {
            fp = step4Ion.exec(w);
            if (fp) {
                stem = fp[1] + fp[2];
                if (mgr1.test(stem)) {
                    w = stem;
                }
            }
        }

        // Step 5
        fp = step5E.exec(w);
        if (fp) {
            stem = fp[1];
            if (mgr1.test(stem) || (meq1.test(stem) && !cvc.test(stem))) {
                w = stem;
            }
        }

        if (step5Ll.test(w) && mgr1.test(w)) {
            w = w.replace(lastChar, "");
        }

        // and turn initial Y back to y
        if (firstch === "y") {
            w = firstch.toLowerCase() + w.substr(1);
        }

        // Exceptional forms. Own-property checks keep inherited names such
        // as "constructor" from being mistaken for table entries.
        if (hasOwn.call(specialWords, origword)) {
            w = specialWords[origword];
        }

        // Exact whole-word match only: fragments of a listed word (e.g.
        // "erring" inside "herring") must still be stemmed normally.
        if (hasOwn.call(invariantWords, origword)) {
            w = origword;
        }

        // Address words overstemmed as gener- and commun-
        for (i = 0; i < overstemFixups.length; i++) {
            if (overstemFixups[i].pattern.test(origword)) {
                w = w + overstemFixups[i].append;
            }
        }

        return w;
    };
})();