2013-08-29 33 views
-2

我想从字符串(或段落)中删除英文单词。但问题是并非所有的单词都被删除了。不过,当我用只有25个单词的短字符串尝试时,它完全正常。(如何将一个段落分成较小的子字符串?)我想分3个步骤进行过滤:

  1. 从字符串中移除链接。
  2. 删除XML标签。
  3. 删除英文单词。

下面是代码:

// Not referenced by any method visible in this snippet.
String SWList[]; 
// Every token kept by removeLinks and by englishWords is appended to this list.
public ArrayList<String> tokens = new ArrayList<String>(); 
// Link-free sentence assembled by removeLinks and handed on to removeXmlTags.
String sentenceSoFar=""; 
// Not referenced by the visible methods (removeLinks declares its own local token variable).
String nextToken; 
// Accumulates the non-link tokens seen by removeLinks.
String withoutLink=null; 
// Not referenced by any method visible in this snippet.
ArrayList<String> wordscount = new ArrayList<String>(); 
// Set to false by removeLinks for each non-link token; never read in the visible code.
boolean flag=false; 
// Not referenced by the visible methods (removeLinks takes a parameter of the same name).
String str; 
// removeLinks only keeps tokens while this is 0.
int counter; 
// Cleaned sentence assembled by englishWords.
String finalStr=""; 
// Not referenced by any method visible in this snippet.
ArrayList<String> spaceCheck = new ArrayList<String>(); 

/**
 * Drops every space-separated token that contains {@code "http"} from
 * {@code str} and passes the remaining text to {@link #removeXmlTags(String)}.
 *
 * <p>Each kept token is appended to {@code tokens} and to {@code withoutLink}.
 * Tokens are only kept while the {@code counter} field is 0. The sentence that
 * is passed on has the form {@code " tok1 tok2 ..."} (one leading space per
 * token), so the first word is space-delimited like every other word.
 * {@code sentenceSoFar} is {@code null} again when the method returns.
 *
 * @param str text to filter; {@code null} is ignored
 */
public void removeLinks(String str) {
    sentenceSoFar = null;
    if (str == null) {
        return;
    }

    final String linkMarker = "http";
    // Build in a StringBuilder: appending to a null String put the literal
    // text "null" in front of the sentence.
    StringBuilder linkFree = new StringBuilder();
    StringTokenizer st = new StringTokenizer(str, " ");
    while (st.hasMoreTokens()) {
        String token = st.nextToken();
        if (token.contains(linkMarker)) {
            continue; // links are simply skipped
        }
        flag = false;
        // NOTE(review): nothing visible here ever changes counter; confirm
        // whether this gate is still needed.
        if (counter == 0) {
            tokens.add(token);
            linkFree.append(' ').append(token);
        }
    }

    sentenceSoFar = linkFree.toString();
    if (linkFree.length() > 0) {
        // Keep accumulating across calls, but never start from the text "null".
        withoutLink = (withoutLink == null) ? sentenceSoFar : withoutLink + sentenceSoFar;
    }

    // An empty string (not null) is passed on when nothing was kept.
    removeXmlTags(sentenceSoFar);
    sentenceSoFar = null;
}

/**
 * Replaces every occurrence (ignoring case) of a fixed list of escaped-markup
 * fragments and filler strings in {@code strTags} with a single space, then
 * passes the result to {@link #englishWords(String)}.
 *
 * <p>Each entry is matched as literal text. Entries are applied in array
 * order, one full pass over the text per entry.
 *
 * @param strTags text to clean; {@code null} is ignored
 */
public void removeXmlTags(String strTags) {
    if (strTags == null) {
        return;
    }

    // NOTE(review): the bare ";" and "&" entries run before later entries that
    // contain those characters (e.g. "/&", "&gt"), so those later entries can
    // no longer match. Short bare entries such as "br", "gt", "nb", "link" are
    // also replaced inside longer words. Confirm both are intended.
    String[] stopWords = new String[] {
        "&amp;nbsp;&lt;/p&gt;",
        " &amp;nbsp;&lt;/p&gt; ",
        ".&lt;/p&gt;"," .&lt;/p&gt; ",
        "1??&quot;&gt;&lt;span&gt;&amp;nbsp;",
        " 1??&quot;&gt;&lt;span&gt;&amp;nbsp; ",
        "&lt;p"," &lt;p ",
        " align=&quot;center&quot;&gt; ",
        " align=&quot;center&quot;&gt;",
        ";",
        "&",
        "/&",
        "&lt",
        " &lt ",
        "_rdEdi",
        " _rdEdi ",
        "br",
        " br ",
        "gt",
        " gt ",
        "exLink",
        " exLink ",
        "link",
        " link ",
        "&gt",
        " &gt ",
        "style",
        " style ",
        ";/div& ",
        "class",
        " cestry ",
        "-",
        " - ",
        "nb",
        " nb ",
        " a ",
        "&lt;p&gt;",
        "&#160;",
        ";/b&",
        ",",
        "/",
        " It ",
        " strong ",
        " span ",
        " Responsibilities ",
        " bull ",
        " amp ",
        " b ",
        " d ",
        " e ",
        " f ",
        " g ",
        " h ",
        " i ",
        " j ",
        " k ",
        " l ",
        " m ",
        " n ",
        " o ",
        " p ",
        " q ",
        " r ",
        " s ",
        " t ",
        " u ",
        " v ",
        " w ",
        " x ",
    };

    for (String stopword : stopWords) {
        // Quote the entry so characters like '.' and '?' are matched literally
        // instead of being interpreted as regex syntax.
        strTags = strTags.replaceAll("(?i)" + java.util.regex.Pattern.quote(stopword), " ");
    }

    englishWords(strTags);
}

public void englishWords(String strWords) { 

    finalStr=null; 
    String[] stopWords = new String[]{ 
     " i " , " a " , " natural " , " and " , " if " ," your" ," about " , " an " , " are " , " as " , " at " , " be " , " by " ," was " ," leadership " , 
     " com " , " for " , " from " , " how " , " in " , " is " , " it " , " not " , " of " , " on " , " or " , " that " , " the " , " this " , " to " , "lt","quot", 
     " what " , " when " , " where " , " who " , " will " , " with " , " the " , " www " ," role " ," provides" ," you " ,"&amp;nbsp;&lt;/p&gt; " ,"align","temp","tor", 
     " Inc." ," Inc." ," is " ," an " ," equal" ," equal " ," Opportunity" ," Opportunity " ," Employer" ," Employer " ," The " ," company" ," candidates" ,"center", 
     " company " ," its" ," affiliates" ," affiliates " ," recruit " ," hire " ," qualified " ," candidates" ," candidates " ," today " ," Facebook " , 
     " without " ," without " ," regard" ," regard " ," to " ," race" ," race " ," religion" ," religion " ," color " ," color " , " sex " ," sexual " , 
     " sexual " ," orientation " ," orientation " ," gender " ," gender " , " identity " ," identity " ," age " ," national " ," national " ," origin" , 
     " origin " ," ancestry" ," ancestry " ," citizenship" ," citizenship " , " veteran" ," veteran" , " or " ," disability" ," disability " ," status" , 
     " status " ," medical" ," medical " ," condition" ," condition " ," marital" ," marital " , " any" ," any " , " other" ," other " ," factor" ," factor " , 
     " prohibited" ," prohibited " ," state " ," state " ," provincial" ," provincial " ," and " ," federal" ," federal " ," municipal" ," municipal " , 
     " it " ," ul " ," LI " ," HR " ," div " ," it " ," ul " ," lt " , " sp " , " Nurse " ," join " ," our " ," Overview " ," specializes " ," highly " ," sampling " , 
     " Description " ," Requirements " ," Intensive " ," Care " ," StartDate " ," ASAP " ," Available " ," Shifts " ," Exclusive " ," order " ," Serving " , 
     " throughout " ," county " ," members " ," range " ," more " ," provide " ," Emergency " ," currently " ," customer " ," unparalleled " ," Spending" , 
     " looking " ," Critical " ," Facility " ," boggling " ," entertainment " ," service " ," benefits " ," commitment " ," outdoor " ," comprehensive " , 
     " settings " ," patient " ," exhilarating " ," interventions " ," environments " ," nurses " ," needs " ," travel " ," primary " ," see " ," experience" , 
     " gas " ," transportation " ," machine " ," construction " ," mining " ," industries " ," detailed " ," corrective " ," action " ," both " ," management " , 
     " management " ," Receiving " ," Inspection " ," verification " ," established " ," which " ," material " ," acceptance " ," measurement " , 
     ," training " ," Familiar " ," shipment " , 
     " levels " ," drawings " ," knowledge " ," Recruiter " ," Recruiter: " ," long " ," short " ," years " ," opportunities " ," competition " ," until " ," Email " ," here " ," quot " ," replace " ," schedule " ," Flexible " , 
     " these" ," can " ," manage " ," multiple " ," tasks " ," simultaneously " ," adapt " ," market " ," changes? " ," basic " ," qualifications " ," only " , 
     " half " ," story " ," considering " ," 7 " ," eleven " ," right " ," choice " ," should " ," consider " ," they " ," possess " ," traits " ," most " ," common " , 
     " successful " ," 7 " ," eleven " ," franchisees " ," can " ," train " ," supervise " ," employees? " ," willing " ," empower " ," them " ," delegate " ," them? " , 
     " dedicated " ," operations " ," excellence? " ," do " ," focus " ," details? " ," committed " ," creating " ," managing " ," organization " ," effectively " , 
     " recruits " ," trains " ," retains " ," motivates " ," people " ," do " ," have " ," desire " ," build " ," emental " ," me " ," through " ," execution " ," ability " , 
     " programs " ," strategies? " ," do" ," have " ," food" ," can " , " aur " , " join " 
    }; 

    for (String stopword : stopWords) 
    { 
     strWords = strWords.replaceAll("(?i)"+stopword, " "); 
    } 

    String delims = " , = ; : ' * % $ @ 0 - _ + () ."; 
    StringTokenizer st = new StringTokenizer(strWords, delims); 

    finalStr =null; 
    while(st.hasMoreTokens()) { 

     String ntoken = st.nextToken(); 
     // System.out.println("LINK CHECK : " + ntoken); 

     tokens.add(ntoken); 
     finalStr += " "+ ntoken; 
     //withoutLink+= " " + nextToken; 

    } 

    // System.out.println("Different STRING : " + finalStr); 
    // new indexing.IndexAlgo().algoOne(finalStr); 
    finalStr=null; 
} 

如果有人能用更好的逻辑或代码帮助我,我将不胜感激。

+2

这就是一个地狱意大利面代码...... = o –

+0

@Maciej Cygan:我认为OP可能自己没有编码这些东西,这就是为什么他不理解代码。 其中很多都是无稽之谈。 – user919860

+0

我不明白'removeLinks'方法的while循环中else语句的意义。'counter'一直保持为0,那它有什么用?而且,为什么你要将flag设置为false?它在任何地方都没有被使用。你是否打算在while循环的每次迭代中增加counter?仅供参考,你的第一个方法肯定能工作,只是结果字符串的开头会多出一个“null”。不过,其中有相当多的代码是可以去掉的。 – user919860

回答

0

第一种和第三种方法看起来应该可以工作,尽管其中有些代码行毫无意义。不过,我没有深入研究第二种方法,大概是因为我对正则表达式(Regular Expressions)不太擅长。我以为 \p 应该与一个 Unicode 属性一起使用。我不知道这里的表达式 \p 应该是什么意思。

我不确定你使用的是什么样的输入以及你的结果是什么。你为什么认为你的代码不工作?

+0

输入字符串是XML格式,我已经解析并且已经将它传递给removeLink方法。现在我想先删除链接,然后是XML标签,然后是所有英文单词...... – user2446605

+0

前两种方法可以正常工作,但第三种方法有问题。假设段落很短,例如只有25个单词,那么所有在stopWords中的单词都会被过滤掉;但是当字符串长度达到1000或1500个单词时,就无法全部过滤掉。所以我想在执行englishWords方法时删除所有英文单词,或者说所有在stopWords数组中的单词 – user2446605

相关问题