2014-10-28 77 views
-1

我能够通过命令提示符使用新训练的tessdata(3.02版)获得正确的OCR输出,但我希望在C#代码中通过DLL引用得到相同的输出。我尝试过引用tessnet2_32.dll,但它会抛出异常。那么,如何在C#代码中通过DLL引用来使用tesseract 3.02版训练好的tessdata?如何在C#中使用tesseract 3.02训练过的数据?

+0

此TESSERACT2.dll来自哪里?哪个项目? – nguyenq 2014-10-29 23:35:31

+0

我从网上下载了tessnet2_32.dll文件。 – 2014-10-30 05:25:35

+0

问题在于版本不匹配:训练好的tessdata是3.02版,而tessnet2_32.dll这个DLL是2.0版。 – 2014-11-03 08:39:18

回答

0

这是用于Tesseract 2.04的。你需要一个与3.02版本兼容的.NET封装库。

1
To access or use Tesseract 3.02 trained data, we have to create a separate wrapper class like the one below.

using System; 
using System.IO; 
using System.Diagnostics; 
using System.Drawing; 

namespace tesseractThree
{
    /// <summary>
    /// Runs OCR by launching the Tesseract command-line executable and
    /// reading back the text file it writes.
    /// </summary>
    /// <remarks>
    /// The intermediate image (out.tif) and the result file (out.txt) use
    /// fixed names inside the current user's ApplicationData folder, so two
    /// calls running at the same time will overwrite each other's files.
    /// </remarks>
    public class TesseractOCR
    {
        private string commandpath;
        private string outbase;
        private string outpath;
        private string tmppath;

        /// <summary>
        /// Creates an instance without an executable path. The temporary file
        /// locations are initialised, but <see cref="analyze"/> will refuse to
        /// run until an executable path is supplied through the other constructor.
        /// </summary>
        public TesseractOCR()
        {
            string dir = Environment.GetFolderPath(Environment.SpecialFolder.ApplicationData);
            tmppath = Path.Combine(dir, "out.tif");
            // Tesseract receives the output name without extension and appends ".txt" itself.
            outbase = Path.Combine(dir, "out");
            outpath = outbase + ".txt";
        }

        /// <summary>
        /// Creates an instance that launches the given Tesseract executable.
        /// </summary>
        /// <param name="commandpath">Path of the tesseract executable to start.</param>
        public TesseractOCR(string commandpath)
            : this()
        {
            this.commandpath = commandpath;
        }

        /// <summary>
        /// Runs Tesseract on an image file and returns the recognised text.
        /// </summary>
        /// <param name="filename">Path of the image file to recognise.</param>
        /// <param name="lang">Language / traineddata name passed with <c>-l</c>; omitted when null or empty.</param>
        /// <param name="noLine">When true, <c>-psm 6</c> is appended to the command line.</param>
        /// <returns>The full content of the text file produced by Tesseract.</returns>
        /// <exception cref="InvalidOperationException">No executable path was supplied at construction.</exception>
        /// <exception cref="ArgumentException"><paramref name="filename"/> is null or empty.</exception>
        /// <exception cref="FileNotFoundException">Tesseract did not produce the result file.</exception>
        public string analyze(string filename, string lang, bool noLine)
        {
            if (string.IsNullOrEmpty(commandpath))
            {
                throw new InvalidOperationException("The path of the tesseract executable has not been set.");
            }
            if (string.IsNullOrEmpty(filename))
            {
                throw new ArgumentException("An input image path is required.", "filename");
            }

            // Quote both paths: the ApplicationData folder or the input file
            // may contain spaces, which would otherwise split the argument.
            string args = Quote(filename) + " " + Quote(outbase);
            if (!string.IsNullOrEmpty(lang))
            {
                args += " -l " + lang;
            }
            if (noLine)
            {
                // NOTE(review): 6 is presumably the "single uniform block of text"
                // page segmentation mode of the tesseract CLI - confirm against its help output.
                args += " -psm 6";
            }

            ProcessStartInfo startinfo = new ProcessStartInfo(commandpath, args);
            startinfo.CreateNoWindow = true;
            startinfo.UseShellExecute = false;

            // Remove any result left over from a previous run so a failed
            // invocation cannot silently return stale text.
            File.Delete(outpath);

            using (Process process = Process.Start(startinfo))
            {
                process.WaitForExit();
            }

            try
            {
                return File.ReadAllText(outpath);
            }
            finally
            {
                File.Delete(outpath);
            }
        }

        /// <summary>
        /// Saves the bitmap as a temporary TIFF file, runs <see cref="analyze"/>
        /// on it and returns the recognised text. The temporary file is removed
        /// even when recognition fails.
        /// </summary>
        /// <param name="bmp">Image to recognise.</param>
        /// <param name="lang">Language / traineddata name, forwarded to <see cref="analyze"/>.</param>
        /// <param name="noLine">Forwarded to <see cref="analyze"/>.</param>
        /// <returns>The recognised text.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="bmp"/> is null.</exception>
        public string OCRFromBitmap(Bitmap bmp, string lang, bool noLine)
        {
            if (bmp == null)
            {
                throw new ArgumentNullException("bmp");
            }

            try
            {
                bmp.Save(tmppath, System.Drawing.Imaging.ImageFormat.Tiff);
                return analyze(tmppath, lang, noLine);
            }
            finally
            {
                File.Delete(tmppath);
            }
        }

        /// <summary>
        /// Wraps a path in double quotes for the command line, leaving it
        /// untouched when the caller already quoted it.
        /// </summary>
        private static string Quote(string path)
        {
            if (path.Length >= 2 && path[0] == '"' && path[path.Length - 1] == '"')
            {
                return path;
            }
            return "\"" + path + "\"";
        }
    }
}

    // Usage of this class: load the image, run OCR through the tesseract
    // executable and show the recognised text. The bitmap is disposed as soon
    // as OCR finishes so the source file is not kept open.
    string lang = "enc";
    string result;
    using (Bitmap b = new Bitmap(@"D:\Image\enc.test_font.exp0.tif"))
    {
        TesseractOCR ocr = new TesseractOCR(@"C:\Program Files\Tesseract-OCR\tesseract.exe");
        result = ocr.OCRFromBitmap(b, lang, true);
    }
    Label1.Text = result;

Or refer to the link below for more details.
https://gist.github.com/yatt/915443 
+0

他的代码无法正常工作,但我明白了:) – 2015-09-20 11:12:12

0

使用tesseractengine3.dll我们可以使用tesseract v3.02训练过的数据,如下所示。

using System; 
using System.Collections.Generic; 
using System.Linq; 
using System.Web; 
using System.Web.UI; 
using System.Web.UI.WebControls; 
using tesseract; 
using System.Drawing; 
using System.IO; 

/// <summary>
/// OCR engine selection; the numeric value is what gets passed to the
/// processor's Init call.
/// </summary>
public enum TesseractEngineMode
{
    /// <summary>Tesseract engine alone (the fastest option).</summary>
    TESSERACT_ONLY = 0,

    /// <summary>Cube engine alone (slower, more accurate).</summary>
    CUBE_ONLY = 1,

    /// <summary>Both engines with their results combined (most accurate).</summary>
    TESSERACT_CUBE_COMBINED = 2,

    /// <summary>
    /// Let initialisation infer one of the modes above from the
    /// language-specific config or the command-line configs; when none of
    /// them specifies a mode, OEM_TESSERACT_ONLY is used.
    /// </summary>
    DEFAULT = 3
}

/// <summary>
/// Page segmentation modes; the numeric value is written to the
/// "tessedit_pageseg_mode" variable.
/// </summary>
/// <remarks>
/// NOTE(review): the numbering here is a six-entry layout starting at
/// PSM_AUTO = 0. Confirm that it matches the page segmentation numbering of
/// the native Tesseract build behind the referenced wrapper DLL.
/// </remarks>
public enum TesseractPageSegMode
{
    /// <summary>Page segmentation is fully automatic.</summary>
    PSM_AUTO = 0,

    /// <summary>The page is one column of text with variable sizes.</summary>
    PSM_SINGLE_COLUMN = 1,

    /// <summary>The page is one uniform block of text (Default).</summary>
    PSM_SINGLE_BLOCK = 2,

    /// <summary>The image holds exactly one line of text.</summary>
    PSM_SINGLE_LINE = 3,

    /// <summary>The image holds exactly one word.</summary>
    PSM_SINGLE_WORD = 4,

    /// <summary>The image holds exactly one character.</summary>
    PSM_SINGLE_CHAR = 5
}

/// <summary>
/// Web Forms page that runs the tesseract wrapper over a fixed TIFF file on
/// every page load and writes the recognised text to the response.
/// </summary>
public partial class importDLL : System.Web.UI.Page
{
    private TesseractProcessor m_tesseract = null;

    // Folder handed to Init as the data path, and the language to load.
    // (A relative location, @"..\..\data\", was used previously.)
    private const string m_path = @"D:\tessdata-3.02\";
    private const string m_lang = "eng";

    /// <summary>
    /// Initialises the OCR processor, recognises the image as a single text
    /// line and writes the result to the HTTP response.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event data (unused).</param>
    /// <exception cref="InvalidOperationException">The processor's Init call reported failure.</exception>
    protected void Page_Load(object sender, EventArgs e)
    {
        // Dispose the image once OCR is done so the file handle and the
        // GDI+ resources are released instead of waiting for finalisation.
        using (var image = System.Drawing.Image.FromFile(@"D:\Image\Capture1T.tif"))
        {
            m_tesseract = new TesseractProcessor();
            bool succeed = m_tesseract.Init(m_path, m_lang, (int)TesseractEngineMode.DEFAULT);
            if (!succeed)
            {
                // Continuing with an uninitialised engine cannot give a
                // meaningful result, so fail with a clear message instead.
                throw new InvalidOperationException(
                    "Tesseract initialisation failed for language '" + m_lang + "' using data path '" + m_path + "'.");
            }

            m_tesseract.SetVariable("tessedit_pageseg_mode", ((int)TesseractPageSegMode.PSM_SINGLE_LINE).ToString());
            m_tesseract.Clear();
            m_tesseract.ClearAdaptiveClassifier();

            string outValue = m_tesseract.Apply(image);
            Response.Write(outValue);
        }
    }
}