[go: up one dir, main page]

Menu

[r49]: / I18N / Arabic / Stemmer.php  Maximize  Restore  History

Download this file

238 lines (220 with data), 8.3 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
<?php
/**
* ----------------------------------------------------------------------
*
* Copyright (c) 2006-2013 Khaled Al-Sham'aa.
*
* http://www.ar-php.org
*
* PHP Version 5
*
* ----------------------------------------------------------------------
*
* LICENSE
*
* This program is open source product; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License (LGPL)
* as published by the Free Software Foundation; either version 3
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/lgpl.txt>.
*
* ----------------------------------------------------------------------
*
* Class Name: Arabic Text ArStemmer Class
*
* Filename: Stemmer.php
*
* Original Author(s): Khaled Al-Sham'aa <khaled@ar-php.org>
*
* Purpose: Get stem of an Arabic word
*
* ----------------------------------------------------------------------
*
* Source: http://arabtechies.net/node/83
* By: Taha Zerrouki <taha.zerrouki@gmail.com>
*
* ----------------------------------------------------------------------
*
* Arabic Word Stemmer Class
*
* PHP class to get stem of an Arabic word
*
* A stemmer is an automatic process in which morphological variants of terms
* are mapped to a single representative string called a stem. Arabic belongs
* to the Semitic family of languages which also includes Hebrew and Aramaic.
* Since morphological change in Arabic results from the addition of prefixes
* and infixes as well as suffixes, simple removal of suffixes is not as
* effective for Arabic as it is for English.
*
* Arabic has much richer morphology than English. Arabic has two genders,
* feminine and masculine; three numbers, singular, dual, and plural; and three
* grammatical cases, nominative, genitive, and accusative. A noun has the
* nominative case when it is a subject; accusative when it is the object of a
* verb; and genitive when it is the object of a preposition. The form of an
* Arabic noun is determined by its gender, number, and grammatical case.
* Definite nouns are formed by attaching the Arabic article "AL" directly to
* the front of the noun. Besides prefixes, a noun can also carry a
* suffix which is often a possessive pronoun. In Arabic, the conjunction word
* "WA" (and) is often attached to the following word.
*
* Like nouns, an Arabic adjective can also have many variants. When an
* adjective modifies a noun in a noun phrase, the adjective agrees with the
* noun in gender, number, case, and definiteness. Arabic verbs have two tenses:
* perfect and imperfect. Perfect tense denotes actions completed, while
* imperfect denotes uncompleted actions. The imperfect tense has four moods:
* indicative, subjunctive, jussive, and imperative. Arabic verbs in perfect
* tense consist of a stem and a subject marker. The subject marker indicates
* the person, gender, and number of the subject. The form of a verb in perfect
* tense can have subject marker and pronoun suffix. The form of a
* subject-marker is determined together by the person, gender, and number of
* the subject.
* Example:
* <code>
* include('./I18N/Arabic.php');
* $obj = new I18N_Arabic('Stemmer');
*
* echo $obj->stem($word);
* </code>
*
* @category I18N
* @package I18N_Arabic
* @author Khaled Al-Sham'aa <khaled@ar-php.org>
* @copyright 2006-2013 Khaled Al-Sham'aa
*
* @license LGPL <http://www.gnu.org/licenses/lgpl.txt>
* @link http://www.ar-php.org
*/
// New in PHP V5.3: Namespaces
// namespace I18N\Arabic;
//
// $obj = new I18N\Arabic\Stemmer();
//
// use I18N\Arabic;
// $obj = new Arabic\Stemmer();
//
// use I18N\Arabic\Stemmer as Stemmer;
// $obj = new Stemmer();
/**
* This PHP class gets the stem of an Arabic word
*
* @category I18N
* @package I18N_Arabic
* @author Khaled Al-Sham'aa <khaled@ar-php.org>
* @copyright 2006-2013 Khaled Al-Sham'aa
*
* @license LGPL <http://www.gnu.org/licenses/lgpl.txt>
* @link http://www.ar-php.org
*/
class I18N_Arabic_Stemmer
{
    // Letters that may appear in a verb prefix / verb suffix.
    private static $_verbPre     = 'وأسفلي';
    private static $_verbPost    = 'ومكانيه';
    // Concatenation of the two verb sets; built lazily by _initAffixSets().
    private static $_verbMay;
    private static $_verbMaxPre  = 4;
    private static $_verbMaxPost = 6;
    private static $_verbMinStem = 2;

    // Letters that may appear in a noun prefix / noun suffix.
    private static $_nounPre     = 'ابفكلوأ';
    private static $_nounPost    = 'اتةكمنهوي';
    // Concatenation of the two noun sets; built lazily by _initAffixSets().
    private static $_nounMay;
    private static $_nounMaxPre  = 4;
    private static $_nounMaxPost = 6;
    private static $_nounMinStem = 2;

    /**
     * Loads initialize values
     *
     * @ignore
     */
    public function __construct()
    {
        self::_initAffixSets();
    }

    /**
     * Build the combined "may be an affix letter" sets once.
     *
     * stem() is static, so it cannot rely on the constructor having run;
     * both entry points call this helper instead.
     *
     * @return void
     */
    private static function _initAffixSets()
    {
        if (self::$_verbMay === null) {
            self::$_verbMay = self::$_verbPre . self::$_verbPost;
        }
        if (self::$_nounMay === null) {
            self::$_nounMay = self::$_nounPre . self::$_nounPost;
        }
    }

    /**
     * Get rough stem of the given Arabic word
     *
     * The word is stemmed twice, once with the noun affix rules and once
     * with the verb affix rules. When both passes succeed, the noun stem is
     * returned only if it is strictly shorter than the verb stem; otherwise
     * the verb stem is returned. When only one pass succeeds its stem is
     * returned.
     *
     * @param string $word Arabic word you would like to get its stem
     *
     * @return string|null Arabic stem of the word, or null when neither rule
     *                     set leaves a long enough stem
     * @author Khaled Al-Sham'aa <khaled@ar-php.org>
     */
    public static function stem($word)
    {
        // Make static calls work even if no instance was ever created.
        self::_initAffixSets();

        $nounStem = self::roughStem(
            $word, self::$_nounMay, self::$_nounPre, self::$_nounPost,
            self::$_nounMaxPre, self::$_nounMaxPost, self::$_nounMinStem
        );
        $verbStem = self::roughStem(
            $word, self::$_verbMay, self::$_verbPre, self::$_verbPost,
            self::$_verbMaxPre, self::$_verbMaxPost, self::$_verbMinStem
        );

        // A failed pass (null) must not beat a valid candidate just because
        // its length would be measured as zero.
        if ($nounStem === null) {
            return $verbStem;
        }
        if ($verbStem === null) {
            return $nounStem;
        }

        if (mb_strlen($nounStem, 'UTF-8') < mb_strlen($verbStem, 'UTF-8')) {
            return $nounStem;
        }

        return $verbStem;
    }

    /**
     * Get rough stem of the given Arabic word (under specific rules)
     *
     * Steps:
     *  1. Find the first and last letters that are NOT in $notChars; these
     *     letters cannot be affix letters and anchor the stem.
     *  2. Limit the candidate prefix to $maxPre letters and the candidate
     *     suffix to $maxPost letters.
     *  3. Cut the candidate prefix at its first letter that is not in
     *     $preChars, and the candidate suffix at its last letter that is not
     *     in $postChars.
     *
     * @param string  $word      Arabic word you would like to get its stem
     * @param string  $notChars  Despite the name, the set of letters that MAY
     *                           belong to a prefix or postfix; any letter not
     *                           in this set is treated as part of the stem
     * @param string  $preChars  Arabic chars those may exists in the prefix
     * @param string  $postChars Arabic chars those may exists in the postfix
     * @param integer $maxPre    Max prefix length
     * @param integer $maxPost   Max postfix length
     * @param integer $minStem   Minimum distance between the first and last
     *                           stem letter (the returned stem therefore has
     *                           at least $minStem + 1 letters)
     *
     * @return string|null Arabic stem of the word under giving rules, or null
     *                     when the remaining stem is too short
     * @author Khaled Al-Sham'aa <khaled@ar-php.org>
     */
    protected static function roughStem(
        $word, $notChars, $preChars, $postChars, $maxPre, $maxPost, $minStem
    ) {
        $right = -1;
        $left  = -1;
        $max   = mb_strlen($word, 'UTF-8');

        // Locate the first ($right) and last ($left) letters that cannot be
        // part of any affix.
        for ($i = 0; $i < $max; $i++) {
            $needle = mb_substr($word, $i, 1, 'UTF-8');
            if (mb_strpos($notChars, $needle, 0, 'UTF-8') === false) {
                if ($right == -1) {
                    $right = $i;
                }
                $left = $i;
            }
        }

        // No anchor letter found: strip no prefix. Leaving $right at -1 would
        // be used as a negative mb_substr() offset (counted from the end of
        // the word) and would overstate the stem length by one.
        if ($right < 0) {
            $right = 0;
        }

        // Enforce the maximum prefix / postfix lengths.
        if ($right > $maxPre) {
            $right = $maxPre;
        }
        if ($max - $left - 1 > $maxPost) {
            $left = $max - $maxPost - 1;
        }

        // The prefix ends at the first leading letter that is not a prefix
        // letter.
        for ($i = 0; $i < $right; $i++) {
            $needle = mb_substr($word, $i, 1, 'UTF-8');
            if (mb_strpos($preChars, $needle, 0, 'UTF-8') === false) {
                $right = $i;
                break;
            }
        }

        // The postfix starts after the last trailing letter that is not a
        // postfix letter.
        for ($i = $max - 1; $i > $left; $i--) {
            $needle = mb_substr($word, $i, 1, 'UTF-8');
            if (mb_strpos($postChars, $needle, 0, 'UTF-8') === false) {
                $left = $i;
                break;
            }
        }

        if ($left - $right >= $minStem) {
            return mb_substr($word, $right, $left - $right + 1, 'UTF-8');
        }

        return null;
    }
}