blob: f765e02193d00624e12858453a40fc161e2a9a64 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
|
#!/usr/bin/env bash
#
# Detects the orientation of every page of a PDF with tesseract and rotates
# the pages that need it with pdftk, overwriting the input file.
#
# Usage:    <this script> FILE.pdf
# Requires: sed, tesseract, pdftk, pdftoppm
set -euo pipefail

# Path of the PDF to fix. The expansion aborts the script with the given
# message when the argument is missing or empty.
readonly PDF_FILE="${1:?Specify PDF file}"

if [[ ! -f "${PDF_FILE}" ]]; then
  echo "File not found: ${PDF_FILE}" >&2
  exit 1
fi

# Fail early, before any work is done, when a required tool is missing.
# Diagnostics go to stderr like the rest of the script's output.
if ! command -v sed >/dev/null 2>&1; then
  echo "sed command not found" >&2
  exit 1
fi
for tool in tesseract pdftk pdftoppm; do
  if ! command -v "${tool}" >/dev/null 2>&1; then
    echo "${tool} is not installed" >&2
    exit 1
  fi
done
# Scratch directory for the rendered page images and the rotated PDF.
# The failure is handled on the assignment itself: with errexit enabled a
# failing mktemp would otherwise end the script before any message is printed.
TMP_DIR=$(mktemp -d) || {
  echo "Could not create temp dir" >&2
  exit 1
}
readonly TMP_DIR
# Second line of defence: mktemp reported success but gave no usable directory.
if [[ -z "${TMP_DIR}" || ! -d "${TMP_DIR}" ]]; then
  echo "Could not create temp dir" >&2
  exit 1
fi
echo "Temp dir: ${TMP_DIR}" >&2

#######################################
# Removes the scratch directory and reports it on stderr.
# Globals:   TMP_DIR (read)
# Arguments: none
#######################################
cleanup() {
  # ":?" aborts instead of running rm -rf on an empty path.
  rm -rf -- "${TMP_DIR:?}"
  echo "Deleted temp dir ${TMP_DIR}" >&2
}
# Run the cleanup on every exit path, successful or not.
trap cleanup EXIT
echo "Extracting pages from PDF" >&2
# Render each page to a grayscale PNG named page-<number>.png in the temp dir.
pdftoppm -gray -png "${PDF_FILE}" "${TMP_DIR}"/page

# pdftk rotate arguments, one "<page number><direction>" word per page to fix.
rotations=()
for page in "${TMP_DIR}"/page-*.png; do
  # An unmatched glob is left as the literal pattern; there is nothing to OCR.
  [[ -e "${page}" ]] || continue
  echo "" >&2

  # Page number, taken from the image file name.
  index="${page##*/page-}"
  index="${index%.png}"
  if [[ ! "${index}" =~ ^[0-9]+$ ]]; then
    echo "Unexpected page image name: ${page}" >&2
    exit 1
  fi
  # The number may be zero-padded in the file name; force base 10 so a value
  # such as 08 is not read as octal and pdftk gets the number without padding.
  index=$((10#${index}))
  echo "Running OCR on page #${index}: ${page}" >&2

  # "--psm 0" asks tesseract for orientation/script detection; the degrees
  # are read from the "Rotate: <degrees>" line of its report.
  rotate=$(tesseract "${page}" - --psm 0 \
    | sed -nE '/^Rotate/s/Rotate: ([0-9]+)/\1/p')

  # The detected angle is relative to the page as it was rendered, so it is
  # applied with pdftk's relative keywords (right/down/left). The absolute
  # keywords (east/south/west) would overwrite a rotation the page already
  # carries instead of adding to it.
  case "${rotate}" in
    90)
      echo "Rotating 90 degrees" >&2
      rotation=right
      ;;
    180)
      echo "Rotating 180 degrees" >&2
      rotation=down
      ;;
    270)
      echo "Rotating 270 degrees" >&2
      rotation=left
      ;;
    0)
      echo "No rotation needed" >&2
      continue
      ;;
    *)
      # Also reached when the report had no "Rotate:" line (empty value).
      echo "Unknown rotation: ${rotate}" >&2
      exit 1
      ;;
  esac
  rotations+=("${index}${rotation}")
done

echo "" >&2
if (( ${#rotations[@]} == 0 )); then
  echo "No pages require rotation" >&2
  exit 0
fi

echo "Rotating pages: ${rotations[*]}" >&2
readonly TMP_PDF="${TMP_DIR}/rotated.pdf"
# Each array element is passed to pdftk as its own argument.
pdftk "${PDF_FILE}" rotate "${rotations[@]}" output "${TMP_PDF}"
# The input is replaced only after pdftk has written the complete result.
mv -- "${TMP_PDF}" "${PDF_FILE}"
echo "Done" >&2
|