aboutsummarylogtreecommitdiffstats
path: root/pdfrrr
blob: f765e02193d00624e12858453a40fc161e2a9a64 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/usr/bin/env bash

# Strict mode: abort on a failing command (-e), on use of an unset variable
# (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# Path of the PDF to process, taken from the first positional argument.
# ':?' rejects an empty argument as well as a missing one.
readonly PDF_FILE="${1:?Specify PDF file}"

if [[ ! -f "${PDF_FILE}" ]]; then
    echo "File not found: ${PDF_FILE}" >&2
    exit 1
fi

# Make sure every external tool the script relies on is available before
# doing any work. Diagnostics go to stderr like the rest of the output.
if ! command -v sed >/dev/null 2>&1; then
    echo "sed command not found" >&2
    exit 1
fi

for tool in tesseract pdftk pdftoppm; do
    if ! command -v "${tool}" >/dev/null 2>&1; then
        echo "${tool} is not installed" >&2
        exit 1
    fi
done

# Scratch directory for the rendered page images and the rotated PDF.
# The assignment is tested inside the condition so that a failing mktemp
# reaches the error message instead of being aborted silently by 'set -e'.
if ! TMP_DIR=$(mktemp -d) || [[ -z "${TMP_DIR}" || ! -d "${TMP_DIR}" ]]; then
    echo "Could not create temp dir" >&2
    exit 1
fi
readonly TMP_DIR

echo "Temp dir: ${TMP_DIR}" >&2

#######################################
# Remove the scratch directory and report that it was deleted.
# Globals:   TMP_DIR (read) - directory to delete
# Arguments: none
# Outputs:   writes a status line to stderr, keeping stdout free of
#            diagnostics like the rest of the script's messages
#######################################
cleanup() {
    # ':?' aborts instead of running 'rm -rf' on an empty path; '--' keeps a
    # path starting with '-' from being parsed as an option.
    rm -rf -- "${TMP_DIR:?}"
    echo "Deleted temp dir ${TMP_DIR}" >&2
}

# Remove the temp dir on every exit path, including the error exits below.
trap cleanup EXIT

# Render each page of the PDF to a grayscale PNG named page-<N>.png.
echo "Extracting pages from PDF" >&2
pdftoppm -gray -png "${PDF_FILE}" "${TMP_DIR}"/page

# pdftk rotation arguments, one "<page><direction>" entry per page that
# needs rotating. An array keeps each entry a separate argument without
# relying on unquoted word splitting.
rotations=()

for page in "${TMP_DIR}"/page-*.png; do
    # Without nullglob an unmatched pattern is passed through literally, so
    # check that this is a real file with the expected page-<N>.png name.
    if [[ ! -f "${page}" || ! "${page##*/}" =~ ^page-([0-9]+)\.png$ ]]; then
        echo "Unexpected page image: ${page}" >&2
        exit 1
    fi
    echo "" >&2
    # pdftoppm zero-pads the page number for documents with 10 or more
    # pages; force base 10 so pdftk is handed a plain page number.
    index=$((10#${BASH_REMATCH[1]}))
    echo "Running OCR on page #${index}: ${page}" >&2
    # '--psm 0' runs orientation/script detection only; pick the number out
    # of the "Rotate: <degrees>" line of its report.
    rotate=$(tesseract "${page}" - --psm 0 \
        | sed -nE '/^Rotate/s/Rotate: ([0-9]+)/\1/p')
    case "${rotate}" in
        90)
            echo "Rotating 90 degrees" >&2
            rotation=east
            ;;
        180)
            echo "Rotating 180 degrees" >&2
            rotation=south
            ;;
        270)
            echo "Rotating 270 degrees" >&2
            rotation=west
            ;;
        0)
            echo "No rotation needed" >&2
            continue
            ;;
        *)
            # Includes the case where no "Rotate:" line was reported.
            echo "Unknown rotation: ${rotate}" >&2
            exit 1
            ;;
    esac
    rotations+=("${index}${rotation}")
done
echo "" >&2

if (( ${#rotations[@]} == 0 )); then
    echo "No pages require rotation" >&2
    exit 0
fi

echo "Rotating pages: ${rotations[*]}" >&2
readonly TMP_PDF="${TMP_DIR}/rotated.pdf"
# Write the rotated document to the temp dir first, then replace the
# original only after pdftk has succeeded.
pdftk "${PDF_FILE}" rotate "${rotations[@]}" output "${TMP_PDF}"
mv -- "${TMP_PDF}" "${PDF_FILE}"

echo "Done" >&2