1. 在 pom.xml 中配置依赖 jar 包:
<!-- Dependencies required by the OCR demo. -->
<dependencies>
<!-- Tess4J: provides the net.sourceforge.tess4j classes (ITesseract, Tesseract, LoadLibs) used by the demo. -->
<dependency>
<groupId>net.sourceforge.tess4j</groupId>
<artifactId>tess4j</artifactId>
<version>3.2.1</version>
</dependency>
<!-- JAI Image I/O, declared with system scope: Maven does not fetch it from a repository,
     so the jar must already exist at lib/jai_imageio.jar under the project root.
     NOTE(review): presumably needed for reading TIFF images; confirm. -->
<dependency>
<groupId>com.sun.media.imageio</groupId>
<artifactId>jai_imageio</artifactId>
<version>1.0_01</version>
<scope>system</scope>
<systemPath>${project.basedir}/lib/jai_imageio.jar</systemPath>
</dependency>
</dependencies>
2. 解压下载到.m2中的tess4j-3.2.1.jar包,找到tessdata文件夹:

3. 在github上下载中文训练包chi_sim.traineddata:

4. 将 tessdata 文件夹(其中放入 chi_sim.traineddata)和待识别的 tif 图片拷贝到 resources 目录下:

5. 准备好待识别的 tif 图片,执行识别:
package com.binge;
import net.sourceforge.tess4j.ITesseract;
import net.sourceforge.tess4j.Tesseract;
import net.sourceforge.tess4j.util.LoadLibs;
import java.io.File;
/**
 * Demo entry point: runs Tesseract OCR (through Tess4J) on an image loaded from the
 * classpath and prints the recognized text to standard output.
 */
public class App {

    /**
     * Recognizes the text in the bundled sample image and prints it.
     *
     * @param args unused
     * @throws Exception if the image resource is missing, its URL cannot be converted to a
     *                   local file, or the OCR call fails
     */
    public static void main(String[] args) throws Exception {
        // Other sample images tried with this demo: "abc.tif", "eurotext.tif", "phototest.tif".
        String fileName = "chi.tif";

        java.net.URL resource = App.class.getClassLoader().getResource(fileName);
        if (resource == null) {
            // Fail with a descriptive message instead of a bare NullPointerException.
            throw new IllegalStateException("Image not found on classpath: " + fileName);
        }
        // URL.getPath() keeps percent-encoding (a space stays "%20", non-ASCII characters are
        // escaped), which yields a File that does not exist. Converting through the URI
        // decodes the path correctly.
        File img = new File(resource.toURI());

        ITesseract iTesseract = new Tesseract();
        // Use "eng" instead when recognizing English images.
        iTesseract.setLanguage("chi_sim");
        // Extract the "tessdata" resources and point Tesseract at the extracted directory.
        iTesseract.setDatapath(LoadLibs.extractTessResources("tessdata").getAbsolutePath());
        System.out.println(iTesseract.doOCR(img));
    }
}
执行前的tif图片(中文):

执行前的英文图片:

执行结果(中文):

执行结果(英文):
