2011-09-30 45 views
0

我一直在尝试将 HTK 与 Sphinx4 配合使用来开发语音识别应用。我的输入是 wav 文件,之前使用 Sphinx4 自带的“Transcriber demo”和“Lattice Demo”进行识别,但输出结果几乎无法接受,所以我决定在 Sphinx4 中引入 HTK 模型。然而由此得到的输出仍然与预期相差甚远。我相信配置还可以在我目前所做的基础上进一步调整。我已经仔细查找过是否有关于 HTK 与 Sphinx4 配合使用的任何教程,但除了这篇很棒的博客(http://nsh.nexiwave.com/2009/09/using-htk-models-in-sphinx4.html)之外,没有找到其他资料。有人能帮我提高使用 HTK 进行非数字识别时的识别精度吗?

我的 Sphinx 配置文件如下:

<?xml version="1.0" encoding="UTF-8"?> 

<!-- 
    Sphinx-4 Configuration file 
--> 

<!-- ******************************************************** --> 
<!-- an4 configuration file        --> 
<!-- ******************************************************** --> 

<config>   

    <!-- ******************************************************** --> 
    <!-- frequently tuned properties        --> 
    <!-- ******************************************************** --> 

    <!-- Global properties. Components elsewhere in this file read them
         through ${name} references. -->
    <property name="logLevel" value="WARNING"/>

    <!-- Beam widths used by standardActiveListFactory and searchManager. -->
    <property name="absoluteBeamWidth" value="-1"/>
    <property name="relativeBeamWidth" value="1E-80"/>

    <!-- Beam widths read by wordActiveListFactory as
         ${absoluteWordBeamWidth} and ${relativeWordBeamWidth}. The values
         mirror the standard beams above, so word pruning is no tighter
         than state pruning. NOTE(review): tune for your task. -->
    <property name="absoluteWordBeamWidth" value="-1"/>
    <property name="relativeWordBeamWidth" value="1E-80"/>

    <property name="wordInsertionProbability" value="1E-36"/>
    <!-- Read by lexTreeLinguist as ${silenceInsertionProbability}.
         A probability of 1.0 applies no penalty to silence insertion.
         NOTE(review): tune for your task. -->
    <property name="silenceInsertionProbability" value="1.0"/>
    <property name="languageWeight"  value="8"/>

    <!-- Names of the front end and recognizer components that other
         components reference as ${frontend} and ${recognizer}. -->
    <property name="frontend" value="epFrontEnd"/>
    <property name="recognizer" value="recognizer"/>
    <property name="showCreations" value="false"/>


    <!-- ******************************************************** --> 
    <!-- word recognizer configuration       --> 
    <!-- ******************************************************** --> 

    <!-- Top-level recognizer: wires in the decoder component and the three
         monitor components listed below. -->
    <component type="edu.cmu.sphinx.recognizer.Recognizer"
               name="recognizer">
        <property name="decoder" value="decoder"/>
        <propertylist name="monitors">
            <item>accuracyTracker </item>
            <item>speedTracker </item>
            <item>memoryTracker </item>
        </propertylist>
    </component>

    <!-- ******************************************************** --> 
    <!-- The Decoder configuration        --> 
    <!-- ******************************************************** --> 

    <!-- Decoder: its search manager is the component named searchManager. -->
    <component type="edu.cmu.sphinx.decoder.Decoder"
               name="decoder">
        <property name="searchManager" value="searchManager"/>
    </component>

    <!-- <component name="searchManager" 
     type="edu.cmu.sphinx.decoder.search.SimpleBreadthFirstSearchManager"> 
     <property name="logMath" value="logMath"/> 
     <property name="linguist" value="lexTreeLinguist"/> 
     <property name="pruner" value="trivialPruner"/> 
     <property name="scorer" value="threadedScorer"/> 
     <property name="activeListFactory" value="activeList"/> 
    </component> 

     <component name="activeList" 
      type="edu.cmu.sphinx.decoder.search.PartitionActiveListFactory"> 
     <property name="logMath" value="logMath"/> 
     <property name="absoluteBeamWidth" value="${absoluteBeamWidth}"/> 
     <property name="relativeBeamWidth" value="${relativeBeamWidth}"/> 
    </component> 

    --> 

    <!-- Search manager used by the decoder. It ties together the
         lexTreeLinguist, the pruner, the scorer and the activeListManager.
         The activeListFactory property that pointed at "activeList" has been
         removed: no component of that name is defined in this file (its
         definition sits inside the commented-out block above), and active
         lists are supplied through activeListManager instead. -->
    <component name="searchManager"
               type="edu.cmu.sphinx.decoder.search.WordPruningBreadthFirstSearchManager">
        <property name="logMath" value="logMath"/>
        <property name="linguist" value="lexTreeLinguist"/>
        <property name="pruner" value="trivialPruner"/>
        <property name="scorer" value="threadedScorer"/>
        <property name="activeListManager" value="activeListManager"/>
        <property name="growSkipInterval" value="0"/>
        <property name="checkStateOrder" value="false"/>
        <property name="buildWordLattice" value="false"/>
        <property name="acousticLookaheadFrames" value="1.7"/>
        <!-- Resolved from the global relativeBeamWidth property. -->
        <property name="relativeBeamWidth" value="${relativeBeamWidth}"/>
    </component>


    <!-- Pruner referenced by searchManager; takes no properties here. -->
    <component type="edu.cmu.sphinx.decoder.pruner.SimplePruner"
               name="trivialPruner"/>

    <!-- Acoustic scorer referenced by searchManager. Its front end is taken
         from the global "frontend" property. -->
    <component type="edu.cmu.sphinx.decoder.scorer.ThreadedAcousticScorer"
               name="threadedScorer">
        <property name="frontend" value="${frontend}"/>
    </component>

    <!-- Active list manager referenced by searchManager. The list below
         names six factories in order; positions 2 and 3 use the word
         factory, all others use the standard factory. -->
    <component type="edu.cmu.sphinx.decoder.search.SimpleActiveListManager"
               name="activeListManager">
        <propertylist name="activeListFactories">
            <item>standardActiveListFactory</item>
            <item>wordActiveListFactory</item>
            <item>wordActiveListFactory</item>
            <item>standardActiveListFactory</item>
            <item>standardActiveListFactory</item>
            <item>standardActiveListFactory</item>
        </propertylist>
    </component>

    <!-- Active list factory whose beams come from the global
         absoluteBeamWidth and relativeBeamWidth properties. -->
    <component type="edu.cmu.sphinx.decoder.search.PartitionActiveListFactory"
               name="standardActiveListFactory">
        <property name="logMath" value="logMath"/>
        <property name="absoluteBeamWidth" value="${absoluteBeamWidth}"/>
        <property name="relativeBeamWidth" value="${relativeBeamWidth}"/>
    </component>

    <!-- Active list factory whose beams come from the global
         absoluteWordBeamWidth and relativeWordBeamWidth properties.
         NOTE(review): make sure both globals are defined in the global
         properties section of this config. -->
    <component type="edu.cmu.sphinx.decoder.search.PartitionActiveListFactory"
               name="wordActiveListFactory">
        <property name="logMath" value="logMath"/>
        <property name="absoluteBeamWidth" value="${absoluteWordBeamWidth}"/>
        <property name="relativeBeamWidth" value="${relativeWordBeamWidth}"/>
    </component>

    <!-- ******************************************************** --> 
    <!-- The linguist configuration        --> 
    <!-- ******************************************************** --> 

    <!-- Grammar-based linguist built on jsgfGrammar and the wsj acoustic
         model. The searchManager in this file points at lexTreeLinguist,
         not at this component. -->
    <component type="edu.cmu.sphinx.linguist.flat.FlatLinguist"
               name="flatLinguist">
        <property name="logMath" value="logMath"/>
        <property name="grammar" value="jsgfGrammar"/>
        <property name="acousticModel" value="wsj"/>
        <property name="wordInsertionProbability" value="${wordInsertionProbability}"/>
        <property name="languageWeight" value="${languageWeight}"/>
        <property name="unitManager" value="unitManager"/>
    </component>


    <!-- ******************************************************** --> 
    <!-- The Grammar configuration        --> 
    <!-- ******************************************************** --> 

    <!-- JSGF grammar named "digits", loaded from the transcriber demo
         resource location; referenced by flatLinguist. -->
    <component type="edu.cmu.sphinx.jsgf.JSGFGrammar"
               name="jsgfGrammar">
        <property name="dictionary" value="dictionary"/>
        <property name="grammarLocation" value="resource:/edu/cmu/sphinx/demo/transcriber/"/>
        <property name="grammarName" value="digits"/>
        <property name="logMath" value="logMath"/>
    </component>

    <!-- ******************************************************** --> 
    <!-- The Dictionary configuration    
     <component name="dictionary" 
     type="edu.cmu.sphinx.linguist.dictionary.FastDictionary"> 
     <property name="dictionaryPath" 
        value="resource:/WSJ_8gau_13dCep_16k_40mel_130Hz_6800Hz/dict/cmudict.0.6d"/> 
     <property name="fillerPath" 
       value="resource:/WSJ_8gau_13dCep_16k_40mel_130Hz_6800Hz/noisedict"/> 
     <property name="addSilEndingPronunciation" value="false"/> 
     <property name="wordReplacement" value="&lt;sil&gt;"/> 
     <property name="unitManager" value="unitManager"/> 
    </component>    --> 
    <!-- ******************************************************** --> 



     <!-- ******************************************************** --> 
    <!-- The Dictionary configuration       --> 
    <!-- ******************************************************** --> 
    <!-- Pronunciation dictionary. The word dictionary is read from a local
         file, while the filler dictionary still comes from the bundled WSJ
         model resource. -->
    <component type="edu.cmu.sphinx.linguist.dictionary.FastDictionary"
               name="dictionary">
        <property name="dictionaryPath" value="file:C:\Raveesh\Softwares\apache-tomcat-6.0.32\apache-tomcat-6.0.32\bin\models\language\wsj\5100.dic"/>
        <property name="fillerPath" value="resource:/WSJ_8gau_13dCep_16k_40mel_130Hz_6800Hz/noisedict"/>
        <property name="addSilEndingPronunciation" value="false"/>
        <property name="wordReplacement" value="&lt;sil&gt;"/>
        <property name="unitManager" value="unitManager"/>
    </component>

    <!-- ******************************************************** --> 
    <!-- The acoustic model configuration       --> 
    <!-- ******************************************************** --> 
    <!-- Acoustic model referenced by both linguists; its data is supplied
         by the wsjLoader component. -->
    <component type="edu.cmu.sphinx.linguist.acoustic.tiedstate.TiedStateAcousticModel"
               name="wsj">
        <property name="loader" value="wsjLoader"/>
        <property name="unitManager" value="unitManager"/>
    </component>



     <component name="wsjLoader" type="edu.cmu.sphinx.linguist.acoustic.tiedstate.HTKLoader"> 
     <property name="logMath" value="logMath"/> 
     <property name="modelDefinition" value="hmmdefs"/> 
     <property name="unitManager" value="unitManager"/> 
    </component> 

<!-- 

    <component name="wsjLoader" type="edu.cmu.sphinx.linguist.acoustic.tiedstate.Sphinx3Loader"> 
     <property name="logMath" value="logMath"/> 
     <property name="unitManager" value="unitManager"/> 
     <property name="location" value="resource:/WSJ_8gau_13dCep_16k_40mel_130Hz_6800Hz"/> 
    </component> 

-->  

    <!-- ******************************************************** --> 
    <!-- The unit manager configuration       --> 
    <!-- ******************************************************** --> 

    <!-- Unit manager shared by the dictionary, the acoustic model, its
         loader and the linguists. -->
    <component type="edu.cmu.sphinx.linguist.acoustic.UnitManager"
               name="unitManager"/>

    <!-- ******************************************************** --> 
    <!-- The live frontend configuration       --> 
    <!-- ******************************************************** --> 
     <!-- 
    <component name="epFrontEnd" type="edu.cmu.sphinx.frontend.FrontEnd"> 
     <propertylist name="pipeline"> 
      <item>audioFileDataSource </item> 
      <item>dataBlocker </item> 
      <item>speechClassifier </item> 
      <item>speechMarker </item> 
      <item>nonSpeechDataFilter </item> 
      <item>preemphasizer </item> 
      <item>windower </item> 
      <item>fft </item> 
      <item>melFilterBank </item> 
      <item>dct </item> 
      <item>liveCMN </item> 
      <item>featureExtraction </item> 
     </propertylist> 
    </component> 



--> 

<!-- the front end configuration using the HTK loader..  --> 

    <!-- Active front end (selected by the global "frontend" property).
         Its pipeline has a single stage, streamHTKSource. -->
    <component type="edu.cmu.sphinx.frontend.FrontEnd"
               name="epFrontEnd">
        <propertylist name="pipeline">
            <item>streamHTKSource</item>
        </propertylist>
    </component>

    <!-- The only stage of the epFrontEnd pipeline; configured with a
         cepstrum length of 39. -->
    <component type="edu.cmu.sphinx.frontend.util.StreamHTKCepstrum"
               name="streamHTKSource">
        <property name="cepstrumLength" value="39"/>
    </component>

    <!-- ******************************************************** --> 
    <!-- The frontend pipelines         --> 
    <!-- ******************************************************** --> 

    <!-- Audio front-end stage components. None of them is listed in the
         active epFrontEnd pipeline in this file; they are named only by the
         commented-out pipeline. NOTE(review): application code may still
         look some of them up by name, so they are kept. -->

    <!-- Audio input -->
    <component type="edu.cmu.sphinx.frontend.util.AudioFileDataSource"
               name="audioFileDataSource"/>

    <component type="edu.cmu.sphinx.frontend.DataBlocker"
               name="dataBlocker"/>

    <!-- Endpointing -->
    <component type="edu.cmu.sphinx.frontend.endpoint.SpeechClassifier"
               name="speechClassifier"/>

    <component type="edu.cmu.sphinx.frontend.endpoint.NonSpeechDataFilter"
               name="nonSpeechDataFilter"/>

    <component type="edu.cmu.sphinx.frontend.endpoint.SpeechMarker"
               name="speechMarker"/>

    <!-- Signal processing and feature stages -->
    <component type="edu.cmu.sphinx.frontend.filter.Preemphasizer"
               name="preemphasizer"/>

    <component type="edu.cmu.sphinx.frontend.window.RaisedCosineWindower"
               name="windower"/>

    <component type="edu.cmu.sphinx.frontend.transform.DiscreteFourierTransform"
               name="fft"/>

    <component type="edu.cmu.sphinx.frontend.frequencywarp.MelFrequencyFilterBank"
               name="melFilterBank"/>

    <component type="edu.cmu.sphinx.frontend.transform.DiscreteCosineTransform"
               name="dct"/>

    <component type="edu.cmu.sphinx.frontend.feature.LiveCMN"
               name="liveCMN"/>

    <component type="edu.cmu.sphinx.frontend.feature.DeltasFeatureExtractor"
               name="featureExtraction"/>

       <!-- Newly Added.. --> 
    <!-- Stream data source configured for a 16000 sample rate and
         little-endian data. It is not listed in the active epFrontEnd
         pipeline in this file. -->
    <component type="edu.cmu.sphinx.frontend.util.StreamDataSource"
               name="streamDataSource">
        <property name="sampleRate" value="16000"/>
        <property name="bigEndianData" value="false"/>
    </component>


    <!-- ******************************************************* --> 
    <!-- monitors            --> 
    <!-- ******************************************************* --> 

    <!-- Accuracy monitor attached to the recognizer; aligned and raw
         results are both switched on. -->
    <component type="edu.cmu.sphinx.instrumentation.BestPathAccuracyTracker"
               name="accuracyTracker">
        <property name="recognizer" value="${recognizer}"/>
        <property name="showAlignedResults" value="true"/>
        <property name="showRawResults" value="true"/>
    </component>

    <!-- Memory monitor attached to the recognizer; summary and details are
         both switched off. -->
    <component type="edu.cmu.sphinx.instrumentation.MemoryTracker"
               name="memoryTracker">
        <property name="recognizer" value="${recognizer}"/>
        <property name="showSummary" value="false"/>
        <property name="showDetails" value="false"/>
    </component>

    <!-- Speed monitor attached to the recognizer and the front end; only
         the summary is switched on. -->
    <component type="edu.cmu.sphinx.instrumentation.SpeedTracker"
               name="speedTracker">
        <property name="recognizer" value="${recognizer}"/>
        <property name="frontend" value="${frontend}"/>
        <property name="showSummary" value="true"/>
        <property name="showDetails" value="false"/>
    </component>


    <!-- ******************************************************* --> 
    <!-- Miscellaneous components        --> 
    <!-- ******************************************************* --> 

    <!-- Shared log-math component referenced by the search, linguist,
         loader and language model components. -->
    <component type="edu.cmu.sphinx.util.LogMath"
               name="logMath">
        <property name="logBase" value="1.0001"/>
        <property name="useAddTable" value="true"/>
    </component>

    <!-- ******************************************************** --> 
    <!-- The linguist configuration        --> 
    <!-- ******************************************************** --> 

    <!-- Linguist used by searchManager. It combines the wsj acoustic model,
         the trigram language model and the dictionary. -->
    <component type="edu.cmu.sphinx.linguist.lextree.LexTreeLinguist"
               name="lexTreeLinguist">
        <!-- Component references -->
        <property name="logMath" value="logMath"/>
        <property name="acousticModel" value="wsj"/>
        <property name="languageModel" value="trigramModel"/>
        <property name="dictionary" value="dictionary"/>

        <!-- Filler and unigram-smear settings -->
        <property name="addFillerWords" value="false"/>
        <property name="fillerInsertionProbability" value="1E-10"/>
        <property name="generateUnitStates" value="false"/>
        <property name="wantUnigramSmear" value="true"/>
        <property name="unigramSmearWeight" value="1"/>

        <!-- Values resolved from global properties of the same name.
             NOTE(review): make sure each of them is defined in the global
             properties section of this config. -->
        <property name="wordInsertionProbability" value="${wordInsertionProbability}"/>
        <property name="silenceInsertionProbability" value="${silenceInsertionProbability}"/>
        <property name="languageWeight" value="${languageWeight}"/>

        <property name="unitManager" value="unitManager"/>
    </component>

    <!-- ******************************************************** --> 
    <!-- The Language Model configuration       --> 
    <!-- ******************************************************** --> 
    <!-- N-gram language model read from a local .lm file, with maxDepth 3;
         referenced by lexTreeLinguist. -->
    <component type="edu.cmu.sphinx.linguist.language.ngram.SimpleNGramModel"
               name="trigramModel">
        <property name="location" value="file:C:\Raveesh\Softwares\apache-tomcat-6.0.32\apache-tomcat-6.0.32\bin\models\language\wsj\5100.lm"/>
        <property name="logMath" value="logMath"/>
        <property name="dictionary" value="dictionary"/>
        <property name="maxDepth" value="3"/>
        <property name="unigramWeight" value=".7"/>
    </component>
</config> 

非常感谢任何帮助。

回答

1

Sphinx4 目前还不支持使用 HTK 模型直接从音频流解码。HTK 所需的前端配置与此不同。您只能按照博客中所述,解码由 HTK 提取的 mfc 文件。关于前端问题的细节,博客评论中也有说明。

+0

是的,我在博客中了解到了这一点。目前我正在想办法改进 Sphinx4 的识别性能,但似乎始终不得要领。假设我决定放弃 HTK,你能否建议只修改 Sphinx 配置文件的做法?我最近开始尝试调整束宽(beam width),稍后会检查一下效果如何。感谢您的答复。 –

+0

我尝试了 http://cmusphinx.sourceforge.net/wiki/sphinx4:largevocabularyperformanceoptimization 中提到的所有性能优化策略,但它们都没能让 Transcriber demo 得到像样的识别结果。关于应该采取什么方法,能否给些建议? –

+0

你好 Raveesh。性能改进是一项复杂的任务,有多种可以尝试的方法。首先要确定你目前的实际性能。在没有合适的测试数据库、精度估计以及对当前性能的分析之前,调整束宽没有多大意义。一旦你有了测试集,可以在 cmusphinx 的 sourceforge 论坛上分享,以获得性能方面的建议。另请参阅 FAQ 条目:http://cmusphinx.sourceforge.net/wiki/faq#qwhy_my_accuracy_is_poor –