2013-03-08 151 views
0

我想知道,对于用户在 Magento 搜索引擎中输入的搜索字符串,能否检测词形变化(如 dog/dogs)、删除不重要的单词(例如 “made in the USA” 中的 “in” 和 “the”)等等,而不必在一大段 PHP 代码中硬编码许多这样的场景。我目前可以在一定程度上处理这个搜索字符串,但代码看起来既不整洁又很丑陋。(标题:Magento:改进搜索引擎——词形变化、无关词语移除等)

有什么建议或指引,可以让它成为一个“智能”(intelligent)的搜索引擎吗?

回答

0

使用这个类:

/**
 * English-language inflector: converts nouns between singular and plural.
 *
 * Lookup order for both directions is:
 *   1. words listed in $uncountable are returned unchanged;
 *   2. caller-supplied irregular words from the global $irregularWords
 *      (optional; a map of singular => plural);
 *   3. the built-in $irregular map;
 *   4. the ordered regex rule table ($plural or $singular), first match wins.
 */
class Inflection
{
    /**
     * Pluralization rules: regex pattern => replacement.
     * Order matters: the first pattern that matches is applied.
     */
    public static $plural = array(
        '/(quiz)$/i' => '$1zes',
        '/^(ox)$/i' => '$1en',
        // A character class needs no "|"; "[m|l]" would also match a literal pipe.
        '/([ml])ouse$/i' => '$1ice',
        // The alternation is grouped so that "ex" only counts after matr/vert/ind;
        // ungrouped, any word ending in "ex" matched with an empty $1.
        '/(matr|vert|ind)(?:ix|ex)$/i' => '$1ices',
        '/(x|ch|ss|sh)$/i' => '$1es',
        '/([^aeiouy]|qu)y$/i' => '$1ies',
        '/(hive)$/i' => '$1s',
        '/(?:([^f])fe|([lr])f)$/i' => '$1$2ves',
        '/(shea|lea|loa|thie)f$/i' => '$1ves',
        '/sis$/i' => 'ses',
        '/([ti])um$/i' => '$1a',
        '/(tomat|potat|ech|her|vet)o$/i' => '$1oes',
        '/(bu)s$/i' => '$1ses',
        '/(alias)$/i' => '$1es',
        '/(octop)us$/i' => '$1i',
        '/(ax|test)is$/i' => '$1es',
        '/(us)$/i' => '$1es',
        '/s$/i' => 's',
        '/$/' => 's',
    );

    /**
     * Singularization rules: regex pattern => replacement.
     * Order matters: the first pattern that matches is applied.
     */
    public static $singular = array(
        '/(quiz)zes$/i' => '$1',
        '/(matr)ices$/i' => '$1ix',
        '/(vert|ind)ices$/i' => '$1ex',
        '/^(ox)en$/i' => '$1',
        '/(alias)es$/i' => '$1',
        '/(octop|vir)i$/i' => '$1us',
        '/(cris|ax|test)es$/i' => '$1is',
        '/(shoe)s$/i' => '$1',
        '/(o)es$/i' => '$1',
        '/(bus)es$/i' => '$1',
        '/([ml])ice$/i' => '$1ouse',
        '/(x|ch|ss|sh)es$/i' => '$1',
        '/(m)ovies$/i' => '$1ovie',
        '/(s)eries$/i' => '$1eries',
        '/([^aeiouy]|qu)ies$/i' => '$1y',
        '/([lr])ves$/i' => '$1f',
        '/(tive)s$/i' => '$1',
        '/(hive)s$/i' => '$1',
        '/(li|wi|kni)ves$/i' => '$1fe',
        '/(shea|loa|lea|thie)ves$/i' => '$1f',
        '/(^analy)ses$/i' => '$1sis',
        // Only $1 is used: appending $2 as well duplicated the captured "a"
        // whenever the "(a)naly" branch matched away from the start of the word.
        '/((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$/i' => '$1sis',
        '/([ti])a$/i' => '$1um',
        '/(n)ews$/i' => '$1ews',
        '/(h|bl)ouses$/i' => '$1ouse',
        '/(corpse)s$/i' => '$1',
        '/(us)es$/i' => '$1',
        '/s$/i' => '',
    );

    /** Built-in irregular nouns: singular => plural. */
    public static $irregular = array(
        'move' => 'moves',
        'foot' => 'feet',
        'goose' => 'geese',
        'sex' => 'sexes',
        'child' => 'children',
        'man' => 'men',
        'tooth' => 'teeth',
        'person' => 'people',
        'admin' => 'admin',
    );

    /** Nouns whose singular and plural forms are identical (lowercase). */
    public static $uncountable = array(
        'sheep',
        'fish',
        'deer',
        'series',
        'species',
        'money',
        'rice',
        'information',
        'equipment',
    );

    /**
     * Returns the plural form of a word.
     *
     * @param string $string Word to pluralize.
     * @return string Plural form, or the input unchanged if it is uncountable.
     */
    public static function pluralize($string)
    {
        return self::inflect($string, true);
    }

    /**
     * Returns the singular form of a word.
     *
     * @param string $string Word to singularize.
     * @return string Singular form, or the input unchanged if it is uncountable.
     */
    public static function singularize($string)
    {
        return self::inflect($string, false);
    }

    /**
     * Formats a count followed by the word, pluralized unless the count is 1.
     *
     * @param int|string $count  Quantity; compared loosely so that the numeric
     *                           string "1" is treated as singular as well.
     * @param string     $string Singular form of the word.
     * @return string e.g. "1 dog" or "3 dogs".
     */
    public static function pluralize_if($count, $string)
    {
        if ($count == 1) {
            return "1 $string";
        }

        return $count . ' ' . self::pluralize($string);
    }

    /**
     * Shared driver for pluralize() and singularize().
     *
     * @param string $string   Word to convert.
     * @param bool   $toPlural True to pluralize, false to singularize.
     * @return string Converted word.
     */
    private static function inflect($string, $toPlural)
    {
        // Save some time when singular and plural are the same.
        if (in_array(strtolower($string), self::$uncountable, true)) {
            return $string;
        }

        // Caller-supplied irregular words take precedence over the built-in ones.
        $maps = array(self::globalIrregularWords(), self::$irregular);
        foreach ($maps as $words) {
            $converted = self::applyIrregular($string, $words, $toPlural);
            if ($converted !== null) {
                return $converted;
            }
        }

        return self::applyRules($string, $toPlural ? self::$plural : self::$singular);
    }

    /**
     * Returns the optional global $irregularWords map, or an empty array when
     * it is not defined or not iterable, so that no foreach warning is raised.
     *
     * @return array|object Map of singular => plural.
     */
    private static function globalIrregularWords()
    {
        if (isset($GLOBALS['irregularWords'])) {
            $words = $GLOBALS['irregularWords'];
            if (is_array($words) || is_object($words)) {
                return $words;
            }
        }

        return array();
    }

    /**
     * Applies the first irregular-word entry whose source form matches the end
     * of the word (case-insensitively). Words are inserted into the pattern
     * unescaped.
     *
     * @param string       $string   Word to convert.
     * @param array|object $words    Map of singular => plural.
     * @param bool         $toPlural Direction of the conversion.
     * @return string|null Converted word, or null when no entry matched.
     */
    private static function applyIrregular($string, $words, $toPlural)
    {
        foreach ($words as $singular => $plural) {
            $from = $toPlural ? $singular : $plural;
            $to = $toPlural ? $plural : $singular;
            $pattern = '/' . $from . '$/i';

            if (preg_match($pattern, $string)) {
                return preg_replace($pattern, $to, $string);
            }
        }

        return null;
    }

    /**
     * Applies the first regex rule that matches the word.
     *
     * @param string $string Word to convert.
     * @param array  $rules  Ordered map of pattern => replacement.
     * @return string Converted word, or the input unchanged when nothing matched.
     */
    private static function applyRules($string, $rules)
    {
        foreach ($rules as $pattern => $replacement) {
            if (preg_match($pattern, $string)) {
                return preg_replace($pattern, $replacement, $string);
            }
        }

        return $string;
    }
}

如果你有时间,请以一种标准的方式处理词形变化(inflection):http://en.wikipedia.org/wiki/Inflection

可以把所有词形变化数据以数组形式存放,或者结合 XML 使用;也可以看看 CodeIgniter 的 inflector 辅助函数,它用起来非常友好:http://ellislab.com/codeigniter/user-guide/helpers/inflector_helper.html

许多框架都内置了词形变化支持,但通常只针对英语。对于其他语言,您需要自己编写……或者在需要时参考 unicode.org 以及其他语言的相关转换标准。