From e0327b03beb1aa31cfecb4bd5945368020b4bd0a Mon Sep 17 00:00:00 2001 From: kev Date: Sat, 7 Dec 2019 13:00:54 +0800 Subject: [PATCH] update tesseract --- tesseract/Dockerfile | 48 ++++++++++---------------------------------- tesseract/README.md | 6 +++--- 2 files changed, 14 insertions(+), 40 deletions(-) diff --git a/tesseract/Dockerfile b/tesseract/Dockerfile index 1db17c6..ae50743 100644 --- a/tesseract/Dockerfile +++ b/tesseract/Dockerfile @@ -1,45 +1,19 @@ # -# Dockerfile for tesseract +# Dockerfile for tesseract CJK # -FROM debian:jessie +FROM alpine MAINTAINER kev RUN set -xe \ - && apt-get update \ - && apt-get install -y autoconf \ - build-essential \ - git \ - libcairo2 \ - libcairo2-dev \ - libgomp1 \ - libicu52 \ - libicu-dev \ - liblept4 \ - libleptonica-dev \ - libpango1.0-0 \ - libpango1.0-dev \ - libtool \ - && git clone https://github.com/tesseract-ocr/tesseract.git \ - && cd tesseract \ - && ./autogen.sh \ - && ./configure \ - && make install \ - && cd .. \ - && git clone https://github.com/tesseract-ocr/tessdata.git \ - && cd tessdata \ - && mv * /usr/local/share/tessdata/ \ - && cd .. \ - && apt-get purge --auto-remove -y autoconf \ - build-essential \ - git \ - libcairo2-dev \ - libicu-dev \ - libleptonica-dev \ - libpango1.0-dev \ - libtool \ - && rm -rf tesseract tessdata /var/cache/apk/* + && apk add --no-cache \ + tesseract-ocr \ + tesseract-ocr-data-chi_sim \ + tesseract-ocr-data-chi_tra \ + tesseract-ocr-data-jpn \ + tesseract-ocr-data-kor \ + && tesseract --version \ + && tesseract --list-langs ENTRYPOINT ["tesseract"] -CMD ["-h"] - +CMD ["--help"] diff --git a/tesseract/README.md b/tesseract/README.md index 0c2f66f..f0c4a35 100644 --- a/tesseract/README.md +++ b/tesseract/README.md @@ -14,9 +14,9 @@ Quick Start ----------- ``` -$ alias tesseract='docker run --rm -v `pwd`:/work -w /work vimagick/tesseract' -$ tesseract myscan.png out -$ cat out.txt +$ alias tesseract='docker run --rm -v `pwd`:/data -w /data vimagick/tesseract' +$ tesseract input.png output -l eng --psm 3 +$ cat output.txt ``` [1]: https://github.com/tesseract-ocr/tesseract