2012-06-24 61 views
0

我有一个问题:我需要一个分词算法,把输入字符串拆分为前缀(Prefix)、词干(Stem)和后缀(Suffix)。

规则是:

Prefix = 0-4字符

Stem = 1- *字符

Suffix = 0-6个字符。

假设输入是“wbAlErbyp”,它需要被拆分成如下形式(示例并不完整):

任何想法如何,我可以做到这一点?

enter image description here

编辑1:

好的,下面是我原来的做法(代码很长,也不够专业)。我自己都快看不懂了,所以想重新设计。

/// <summary>
/// Transliterates an Arabic word into Latin characters, enumerates every
/// prefix/stem/suffix split of the transliterated word (prefix 0-4 chars,
/// stem at least 1 char, suffix 0-6 chars) and returns the meaning text of
/// every split whose three parts are all known and mutually compatible.
/// </summary>
/// <param name="inputTextArabic">The Arabic word to analyse.</param>
/// <returns>
/// One meaning string per compatible prefix/stem/suffix combination, in the
/// order the combinations are found; empty when nothing matches.
/// </returns>
public static List<string> GetMatches(string inputTextArabic)
{
    const int maxPrefixLength = 4;
    const int maxSuffixLength = 6;

    // store matches/results here
    List<string> results = new List<string>();

    string latString = TransliterateToLatin(inputTextArabic);
    int lenWord = latString.Length;

    // loop through the different stem sizes (a stem has at least one character)
    for (int lenStem = 1; lenStem <= lenWord; lenStem++)
    {
        // the prefix is at most 4 characters, fewer when the word is short
        int lenPrefMax = System.Math.Min(maxPrefixLength, lenWord - lenStem);

        for (int lenPref = 0; lenPref <= lenPrefMax; lenPref++)
        {
            // The three parts must cover the whole word, so once the stem and
            // prefix lengths are chosen the suffix length is fully determined.
            int lenSuff = lenWord - lenStem - lenPref;
            if (lenSuff > maxSuffixLength)
            {
                continue;
            }

            string prefix = latString.Substring(0, lenPref);
            string stem = latString.Substring(lenPref, lenStem);
            string suffix = latString.Substring(lenPref + lenStem, lenSuff);

            // see whether each part exists in its lookup collection
            List<WordBit> prefMatches = (from x in prefixes where x.NoVowels == prefix select x).Distinct().ToList();
            List<WordBit> stemMatches = (from x in stems where x.NoVowels == stem select x).Distinct().ToList();
            List<WordBit> suffMatches = (from x in suffixes where x.NoVowels == suffix select x).Distinct().ToList();

            if (prefMatches.Count == 0 || stemMatches.Count == 0 || suffMatches.Count == 0)
            {
                continue;
            }

            // all three parts are known; now check every combination of entries
            foreach (WordBit prefMatch in prefMatches)
            {
                foreach (WordBit stemMatch in stemMatches)
                {
                    foreach (WordBit suffMatch in suffMatches)
                    {
                        string prefType = prefMatch.Type;
                        string stemType = stemMatch.Type;
                        string suffType = suffMatch.Type;

                        // find out whether the types are pairwise compatible
                        bool prefStemConnects = (from x in prefixStemConns where x.Type1 == prefType && x.Type2 == stemType select x).Any();
                        bool stemSuffConnects = (from x in stemSuffixConns where x.Type1 == stemType && x.Type2 == suffType select x).Any();
                        bool prefSuffConnects = (from x in prefixSuffixConns where x.Type1 == prefType && x.Type2 == suffType select x).Any();

                        // All three pairings must connect. The prefix/suffix
                        // pairing was previously computed but never checked
                        // (prefix/stem was tested twice instead).
                        if (!(prefStemConnects && stemSuffConnects && prefSuffConnects))
                        {
                            continue;
                        }

                        Match match = new Match();

                        // stem meaning: text of Extra up to the first pair of
                        // whitespace characters, or up to the end if there is none
                        match.MatchMeaning = match.RootMeaning = Regex.Match(stemMatch.Extra, @"^.*?(?=\s\s|$)").ToString();

                        // suffix info: text of Extra up to the first pair of
                        // whitespace characters; empty when there is no such pair
                        match.SuffixInfo = Regex.Match(suffMatch.Extra, @"^.*?(?=\s\s)").ToString();
                        if (match.SuffixInfo != "")
                        {
                            if (match.SuffixInfo.Contains("<verb>"))
                            {
                                // the suffix text is a template; substitute the stem meaning into it
                                match.MatchMeaning = match.SuffixInfo.Replace("<verb>", match.RootMeaning);
                                match.SuffixInfo = "";
                            }
                            else
                            {
                                match.MatchMeaning = match.MatchMeaning + " " + match.SuffixInfo;
                            }
                        }

                        // prefix info: text of Extra up to the first whitespace character
                        match.PrefixInfo = Regex.Match(prefMatch.Extra, @"^.*?(?=\s|\s\s|$)").ToString();
                        if (match.PrefixInfo != "")
                        {
                            // NOTE(review): this rebuilds the meaning from RootMeaning, which
                            // discards a "<verb>" template substitution made above - confirm
                            // that this is intended.
                            match.MatchMeaning = match.PrefixInfo + " " + match.RootMeaning + " " + match.SuffixInfo;
                        }

                        results.Add(match.MatchMeaning);

                        Debug.Print("_____________________________________________________________________________________");
                        Debug.Print(prefMatch.NoVowels + "\t\t" + prefMatch.Vowels + "\t\t" + prefMatch.Type + "\t\t" + prefMatch.Extra);
                        Debug.Print(stemMatch.NoVowels + "\t\t" + stemMatch.Vowels + "\t\t" + stemMatch.Type + "\t\t" + stemMatch.Extra);
                        Debug.Print(suffMatch.NoVowels + "\t\t" + suffMatch.Vowels + "\t\t" + suffMatch.Type + "\t\t" + suffMatch.Extra);
                        Debug.Print("______________________________________________________________________________________");
                    }
                }
            }
        }
    }

    return results;
}

/// <summary>
/// Converts each Arabic character of <paramref name="arabicText"/> to its
/// Latin stand-in using a fixed table. Characters that are not in the table
/// are dropped from the result.
/// </summary>
private static string TransliterateToLatin(string arabicText)
{
    System.Text.StringBuilder latin = new System.Text.StringBuilder(arabicText.Length);

    foreach (char arabicChar in arabicText)
    {
        switch (arabicChar)
        {
            case 'ا': latin.Append('A'); break;
            case 'آ': latin.Append('|'); break;
            case 'ؤ': latin.Append('&'); break;
            case 'ئ': latin.Append('}'); break;
            case 'أ': latin.Append('>'); break;
            case 'إ': latin.Append('<'); break;
            case 'ء': latin.Append('\\'); break;
            case 'ب': latin.Append('b'); break;
            case 'ت': latin.Append('t'); break;
            case 'ة': latin.Append('p'); break;
            case 'ث': latin.Append('v'); break;
            case 'ج': latin.Append('j'); break;
            case 'ح': latin.Append('H'); break;
            case 'خ': latin.Append('x'); break;
            case 'د': latin.Append('d'); break;
            case 'ذ': latin.Append('*'); break;
            case 'ر': latin.Append('r'); break;
            case 'ز': latin.Append('z'); break;
            case 'س': latin.Append('s'); break;
            case 'ش': latin.Append('$'); break;
            case 'ص': latin.Append('S'); break;
            case 'ض': latin.Append('D'); break;
            case 'ط': latin.Append('T'); break;
            case 'ظ': latin.Append('Z'); break;
            case 'ع': latin.Append('E'); break;
            case 'غ': latin.Append('g'); break;
            case 'ـ': latin.Append('_'); break;
            case 'ف': latin.Append('f'); break;
            case 'ق': latin.Append('q'); break;
            case 'ك': latin.Append('k'); break;
            case 'ل': latin.Append('l'); break;
            case 'م': latin.Append('m'); break;
            case 'ن': latin.Append('n'); break;
            case 'ه': latin.Append('h'); break;
            case 'و': latin.Append('w'); break;
            case 'ى': latin.Append('Y'); break;
            case 'ي': latin.Append('y'); break;
            case 'ً': latin.Append('F'); break;
            case 'ٌ': latin.Append('N'); break;
            case 'ٍ': latin.Append('K'); break;
            case 'َ': latin.Append('a'); break;
            case 'ُ': latin.Append('u'); break;
            case 'ِ': latin.Append('i'); break;
            case 'ّ': latin.Append('~'); break;
            case 'ْ': latin.Append('o'); break;
        }
    }

    return latin.ToString();
}
+0

到目前为止,你有没有尝试过任何东西?如果你显示一些初步的努力,你更有可能获得帮助。 –

+0

没有足够的信息。你怎么知道前缀的结尾和词干开始?后缀同样适用。没有这些信息,你就无法做到。 – Will

+0

好的,稍等,我把目前的算法贴出来。Will,你事先并不知道边界在哪里,而是先生成所有可能的组合,然后我再到我的数据里去查找。 – sprocket12

回答

1

只需构建两个嵌套循环,遍历所有可能的前缀和后缀长度。

// Enumerate every prefix/stem/suffix split of the sample word and print one split per line.
string s = "wbAlErbyp";

const int maxPrefixLength = 4;
const int maxSuffixLength = 6;
const int minStemLength = 1;

// Longest prefix that still leaves room for a minimal stem.
int prefixLimit = Math.Min(maxPrefixLength, s.Length - minStemLength);

for (int prefixLength = 0; prefixLength <= prefixLimit; prefixLength++)
{
    // Longest suffix that still leaves room for this prefix and a minimal stem.
    int suffixLimit = Math.Min(maxSuffixLength, s.Length - minStemLength - prefixLength);

    for (int suffixLength = 0; suffixLength <= suffixLimit; suffixLength++)
    {
        // Whatever is not prefix or suffix is the stem.
        int stemLength = s.Length - prefixLength - suffixLength;

        string prefix = s.Substring(0, prefixLength);
        string stem = s.Substring(prefixLength, stemLength);
        string suffix = s.Substring(prefixLength + stemLength);

        Console.WriteLine("{0} {1} {2}", prefix, stem, suffix);
    }
}
+0

天才!你是如何解决这个问题的? – sprocket12

+0

@MuhammadA 做法和你的几乎一样。唯一重要的区别是,我把这些条件放进了循环的终止条件里。 – CodesInChaos