2016-12-27 29 views
0

我已经用我自己的POS语料库（准确率超过90%）成功地评估了LingPipe的HMM POS标注器实现，现在想在Java中序列化/编译这个隐马尔可夫模型（HMM）。

我用于评估自己的POS语料库的Ant构建文件，与评估布朗（Brown）POS语料库所用的相同：

<!-- Runs EvaluatePos over a POS corpus.  Named "eval-brown" but the same
     layout works for any PosCorpus implementation: only args[6] (corpus
     class) and args[7] (data path) change. -->
<target name="eval-brown" 
        depends="compile"> 
  <java classname="EvaluatePos" 
        fork="true" 
        maxMemory="512M"> 
    <jvmarg value="-server"/> 
    <classpath refid="classpath.standard"/> 
    <arg value="1"/>                 <!-- args[0]: sentence eval rate --> 
    <arg value="50000"/>            <!-- args[1]: training toks before eval --> 
    <arg value="10"/>                <!-- args[2]: max n-best --> 
    <arg value="8"/>                 <!-- args[3]: char n-gram size --> 
    <arg value="128"/>               <!-- args[4]: num characters --> 
    <arg value="8.0"/>               <!-- args[5]: interpolation (lambda) ratio --> 
    <arg value="BrownPosCorpus"/>  <!-- args[6]: corpus implementation class --> 
    <arg value="${data.pos.brown}"/>    <!-- args[7]: corpus data file/dir --> 
    <arg value="true"/>            <!-- smooth tags; NOTE(review): EvaluatePos as shown never reads args[8] --> 
  </java> 
</target> 

用于评估HMM POS标注器的类是EvaluatePos.java，代码如下：

/**
 * Trains and evaluates a character language-model HMM part-of-speech tagger
 * over a {@code PosCorpus}, printing first-best, n-best and marginal tagging
 * accuracy as training progresses (a learning curve), then a final report.
 *
 * <p>Command-line arguments (cf. the accompanying Ant target):
 * args[0] sentence eval rate; args[1] training tokens before eval starts;
 * args[2] max n-best; args[3] character n-gram size; args[4] number of
 * characters; args[5] interpolation (lambda) factor; args[6] corpus
 * implementation class name; args[7] corpus data file.
 *
 * <p>NOTE(review): the Ant target passes a ninth argument ("smoothe tags"),
 * but args[8] is never read by this class.
 */
public class EvaluatePos {

    // Command-line parameters; immutable once constructed.
    final int mSentEvalRate;      // evaluate on every n-th training sentence
    final int mToksBeforeEval;    // training tokens required before eval begins
    final int mMaxNBest;          // maximum n-best list size for n-best eval
    final int mNGram;             // character n-gram order for the HMM LMs
    final int mNumChars;          // number of distinct characters for the LMs
    final double mLambdaFactor;   // LM interpolation ratio
    final PosCorpus mCorpus;      // corpus, reflectively built from args[6]/args[7]

    // Tags seen during the profiling pass; becomes the HMM state set.
    final Set<String> mTagSet = new HashSet<String>();
    HmmCharLmEstimator mEstimator;
    TaggerEvaluator<String> mTaggerEvaluator;
    NBestTaggerEvaluator<String> mNBestTaggerEvaluator;
    MarginalTaggerEvaluator<String> mMarginalTaggerEvaluator;

    int mTrainingSentenceCount = 0;
    int mTrainingTokenCount = 0;

    /**
     * Parses the command line and reflectively constructs the corpus.
     *
     * @param args argument layout documented on the class
     * @throws Exception if a numeric argument is malformed or the corpus
     *         class cannot be loaded or instantiated
     */
    public EvaluatePos(String[] args) throws Exception {
        // parseInt/parseDouble: targets are primitive fields, so the boxed
        // Integer.valueOf/Double.valueOf of the original were pure overhead.
        mSentEvalRate = Integer.parseInt(args[0]);
        mToksBeforeEval = Integer.parseInt(args[1]);
        mMaxNBest = Integer.parseInt(args[2]);
        mNGram = Integer.parseInt(args[3]);
        mNumChars = Integer.parseInt(args[4]);
        mLambdaFactor = Double.parseDouble(args[5]);
        String constructorName = args[6];
        File corpusFile = new File(args[7]);
        @SuppressWarnings("rawtypes") // reflective construction; 2-step assignment
        PosCorpus corpus
            = (PosCorpus)
            Class
            .forName(constructorName)
            .getConstructor(File.class)   // varargs form replaces new Class[]{...}
            .newInstance(corpusFile);     // ...and new Object[]{...}
        mCorpus = corpus;
    }

    /**
     * Runs the two-pass evaluation: a profiling pass that collects the tag
     * set and corpus statistics, then a train-and-evaluate pass that prints
     * a learning curve followed by a final report.
     *
     * @throws IOException if the corpus cannot be read
     */
    void run() throws IOException {
        System.out.println("\nCOMMAND PARAMETERS:");
        System.out.println(" Sent eval rate=" + mSentEvalRate);
        System.out.println(" Toks before eval=" + mToksBeforeEval);
        System.out.println(" Max n-best eval=" + mMaxNBest);
        System.out.println(" Max n-gram=" + mNGram);
        System.out.println(" Num chars=" + mNumChars);
        System.out.println(" Lambda factor=" + mLambdaFactor);

        // Pass 1: profile the corpus to discover the full tag set.
        CorpusProfileHandler profileHandler = new CorpusProfileHandler();
        parseCorpus(profileHandler);
        String[] tags = mTagSet.toArray(Strings.EMPTY_STRING_ARRAY);
        Arrays.sort(tags);
        // Built in one shot; the original copied element-by-element in a loop.
        Set<String> tagSet = new HashSet<String>(Arrays.asList(tags));

        System.out.println("\nCORPUS PROFILE:");
        System.out.println(" Corpus class=" + mCorpus.getClass().getName());
        System.out.println(" #Sentences="
             + mTrainingSentenceCount);
        System.out.println(" #Tokens=" + mTrainingTokenCount);
        System.out.println(" #Tags=" + tags.length);
        System.out.println(" Tags=" + Arrays.asList(tags));

        // Pass 2: train incrementally while evaluating (learning curve).
        System.out.println("\nEVALUATION:");
        mEstimator
            = new HmmCharLmEstimator(mNGram,mNumChars,mLambdaFactor);
        for (String tag : tags)
            mEstimator.addState(tag);

        HmmDecoder decoder
            = new HmmDecoder(mEstimator); // no caching
        boolean storeTokens = true;
        mTaggerEvaluator
            = new TaggerEvaluator<String>(decoder,storeTokens);
        mNBestTaggerEvaluator
            = new NBestTaggerEvaluator<String>(decoder,mMaxNBest,mMaxNBest);
        mMarginalTaggerEvaluator
            = new MarginalTaggerEvaluator<String>(decoder,tagSet,storeTokens);

        LearningCurveHandler evaluationHandler
            = new LearningCurveHandler();
        parseCorpus(evaluationHandler);

        System.out.println("\n\n\nFINAL REPORT");

        System.out.println("\n\nFirst Best Evaluation");
        System.out.println(mTaggerEvaluator.tokenEval());

        System.out.println("\n\nN Best Evaluation");
        System.out.println(mNBestTaggerEvaluator.nBestHistogram());

    }

    /**
     * Streams every source in the corpus through the given handler.
     *
     * @param handler receives one {@code Tagging} per sentence
     * @throws IOException if any corpus source cannot be read
     */
    void parseCorpus(ObjectHandler<Tagging<String>> handler) throws IOException {
        Parser<ObjectHandler<Tagging<String>>> parser = mCorpus.parser();
        parser.setHandler(handler);
        Iterator<InputSource> it = mCorpus.sourceIterator();
        while (it.hasNext())
            parser.parse(it.next());
    }

    /** First-pass handler: counts sentences/tokens and collects the tag set. */
    class CorpusProfileHandler implements ObjectHandler<Tagging<String>> {
        public void handle(Tagging<String> tagging) {
            ++mTrainingSentenceCount;
            mTrainingTokenCount += tagging.size();
            for (int i = 0; i < tagging.size(); ++i)
                mTagSet.add(tagging.tag(i));
        }
    }

    /**
     * Second-pass handler: evaluates each eligible sentence against the
     * model trained so far, prints per-case and cumulative reports, then
     * trains on the sentence (eval-before-train keeps the test honest).
     */
    class LearningCurveHandler implements ObjectHandler<Tagging<String>> {
        Set<String> mKnownTokenSet = new HashSet<String>(); // tokens already trained on
        int mUnknownTokensTotal = 0;
        int mUnknownTokensCorrect = 0;
        public void handle(Tagging<String> tagging) {
            // Only evaluate once enough tokens are trained, and only on
            // every mSentEvalRate-th sentence.
            if (mEstimator.numTrainingTokens() > mToksBeforeEval
                && mEstimator.numTrainingCases() % mSentEvalRate == 0) {

                mTaggerEvaluator.handle(tagging);
                mNBestTaggerEvaluator.handle(tagging);
                mMarginalTaggerEvaluator.handle(tagging);
                System.out.println("\nTest Case "
                     + mTaggerEvaluator.numCases());
                System.out.println("First Best Last Case Report");
                System.out.println(mTaggerEvaluator.lastCaseToString(mKnownTokenSet));
                System.out.println("N-Best Last Case Report");
                System.out.println(mNBestTaggerEvaluator.lastCaseToString(5));
                System.out.println("Marginal Last Case Report");
                System.out.println(mMarginalTaggerEvaluator.lastCaseToString(5));
                System.out.println("Cumulative Evaluation");
                System.out.print(" Estimator: #Train Cases="
                    + mEstimator.numTrainingCases());
                System.out.println(" #Train Toks="
                     + mEstimator.numTrainingTokens());
                ConfusionMatrix tokenEval = mTaggerEvaluator.tokenEval().confusionMatrix();
                System.out.println(" First Best Accuracy (All Tokens) = "
                     + tokenEval.totalCorrect()
                     + "/" + tokenEval.totalCount()
                     + " = " + tokenEval.totalAccuracy());
                ConfusionMatrix unkTokenEval = mTaggerEvaluator.unknownTokenEval(mKnownTokenSet).confusionMatrix();
                mUnknownTokensTotal += unkTokenEval.totalCount();
                mUnknownTokensCorrect += unkTokenEval.totalCorrect();
                // NOTE(review): prints NaN if no unknown tokens have been
                // seen yet (0/0.0); harmless but worth knowing.
                System.out.println(" First Best Accuracy (Unknown Tokens) = "
                     + mUnknownTokensCorrect
                     + "/" + mUnknownTokensTotal
                     + " = " + (mUnknownTokensCorrect/(double)mUnknownTokensTotal));
            }
            // train after eval
            mEstimator.handle(tagging);
            for (int i = 0; i < tagging.size(); ++i)
                mKnownTokenSet.add(tagging.token(i));
        }
    }

    /** Command-line entry point; see the class documentation for arguments. */
    public static void main(String[] args)
        throws Exception {

        new EvaluatePos(args).run();
    }
}

我的问题是：如何创建这个HMM模型文件，以便将它用作基于链式CRF的NER的特征。

Lingpipe ../../models文件夹中的pos-en-general-brown.HiddenMarkovModel是如何创建的?

我使用BrownPosCorpus.java,BrownPosParser.java和EvaluatePos.java

我应该把下面的代码放在哪里，才能创建POS HMM模型文件？

// write output to file 
        File modelFile = new File(args[1]); 
        AbstractExternalizable.compileTo(estimator,modelFile); 

可以对Ant文件进行哪些更改以创建pos hmm模型文件?

我想使用的POS HMM模型文件作为链CRF特征提取功能:

... 
    static final File POS_HMM_FILE 
        = new File("../../models/pos-en-general-brown.HiddenMarkovModel"); 
... 

最好的问候。

回答

0

要保存训练好的HMM，只需像序列化其他Java对象一样，序列化该训练对象即可。

您需要为HMM编写特征提取器回调函数。

您可能想要将HMM和CRF打包到一个新的可序列化对象中。有一个基类AbstractExternalizable，它能让这件事更容易，也更利于向前兼容。