summarylogtreecommitdiffstats
diff options
context:
space:
mode:
authorChristopher2022-02-18 19:07:13 -0500
committerChristopher2022-02-18 19:07:13 -0500
commitdf7d939c2e057628e9c5844069534c88eebea3d3 (patch)
tree924adf68034faf1c246969922f31de09e0964c7f
parentf9b5214588d323878aeb3a7aefbe5bc8b51d40dc (diff)
downloadaur-df7d939c2e057628e9c5844069534c88eebea3d3.tar.gz
Added a man page based on doc/jbig2enc.html
Patched pdf.py to use python 3; removed python 2 as optional dependency. Added automake, libtool, and gzip (for man page) as makedepends.
-rw-r--r--.SRCINFO18
-rw-r--r--PKGBUILD28
-rw-r--r--jbig2enc-pdfpy.patch12
-rw-r--r--jbig2enc.1201
4 files changed, 248 insertions, 11 deletions
diff --git a/.SRCINFO b/.SRCINFO
index 7ffb18852856..469381e283c6 100644
--- a/.SRCINFO
+++ b/.SRCINFO
@@ -1,19 +1,27 @@
pkgbase = jbig2enc
pkgdesc = A JBIG2 image encoder
pkgver = 0.29
- pkgrel = 1
+ pkgrel = 2
url = https://github.com/agl/jbig2enc
arch = i686
arch = x86_64
license = Apache
+ makedepends = automake
+ makedepends = libtool
+ makedepends = gzip
depends = gcc-libs
depends = leptonica>=1.68
depends = libpng
depends = libjpeg
depends = libtiff
- optdepends = python2: for pdf.py
- source = https://github.com/agl/jbig2enc/archive/0.29.tar.gz
- md5sums = c3d1f7fd51e272301d5de436afb1fccb
+ optdepends = python: for pdf.py
+ provides = jbig2enc
+ conflicts = jbig2enc-git
+ source = https://github.com/agl/jbig2enc/archive/refs/tags/0.29.tar.gz
+ source = jbig2enc-pdfpy.patch
+ source = jbig2enc.1
+ sha256sums = bfcf0d0448ee36046af6c776c7271cd5a644855723f0a832d1c0db4de3c21280
+ sha256sums = 2614e02f9cc71d9b186ffaecf6abb4b270ad9ce43cb4d6284e9e96c4e4a44d06
+ sha256sums = c940124f102695872fae02b243e0dd99c05ecfb3ecef0a476b3e903a0db69a54
pkgname = jbig2enc
-
diff --git a/PKGBUILD b/PKGBUILD
index 4f8bbb34c3a0..2407950a7849 100644
--- a/PKGBUILD
+++ b/PKGBUILD
@@ -1,19 +1,32 @@
-# Maintainer: Vianney le Clément <vleclement AT gmail·com>
+# Maintainer: Christopher Hillenbrand <chillenb dot lists at gmail dot com>
+# Contributor: Doron Behar <doron dot behar at gmail dot com>
+# Contributor: Vianney le Clément <vleclement at gmail dot com>
+
pkgname=jbig2enc
pkgver=0.29
-pkgrel=1
+pkgrel=2
pkgdesc="A JBIG2 image encoder"
arch=('i686' 'x86_64')
url="https://github.com/agl/jbig2enc"
license=('Apache')
depends=('gcc-libs' 'leptonica>=1.68' 'libpng' 'libjpeg' 'libtiff')
-optdepends=('python2: for pdf.py')
-source=("https://github.com/agl/${pkgname}/archive/${pkgver}.tar.gz")
-md5sums=('c3d1f7fd51e272301d5de436afb1fccb')
+optdepends=('python: for pdf.py')
+provides=('jbig2enc')
+conflicts=('jbig2enc-git')
+makedepends=('automake' 'libtool' 'gzip')
+source=("https://github.com/agl/jbig2enc/archive/refs/tags/$pkgver.tar.gz"
+ "jbig2enc-pdfpy.patch"
+ "jbig2enc.1")
+sha256sums=('bfcf0d0448ee36046af6c776c7271cd5a644855723f0a832d1c0db4de3c21280'
+ '2614e02f9cc71d9b186ffaecf6abb4b270ad9ce43cb4d6284e9e96c4e4a44d06'
+ 'c940124f102695872fae02b243e0dd99c05ecfb3ecef0a476b3e903a0db69a54')
+
prepare() {
cd "${srcdir}/${pkgname}-${pkgver}"
- sed -i 's@^#!/usr/bin/python$@#!/usr/bin/env python2@' pdf.py
+ #sed -i 's@^#!/usr/bin/python$@#!/usr/bin/env python2@' pdf.py
+ patch --forward --strip=1 --input="${srcdir}/jbig2enc-pdfpy.patch"
+ cat "${srcdir}/jbig2enc.1" | gzip > "${srcdir}/jbig2enc.1.gz"
}
build() {
@@ -26,4 +39,7 @@ build() {
package() {
cd "${srcdir}/${pkgname}-${pkgver}"
make install DESTDIR="$pkgdir"
+ install -dm755 "$pkgdir/usr/share/man/man1"
+ install -m644 "${srcdir}/jbig2enc.1.gz" "$pkgdir/usr/share/man/man1"
}
+
diff --git a/jbig2enc-pdfpy.patch b/jbig2enc-pdfpy.patch
new file mode 100644
index 000000000000..76f084dac9cc
--- /dev/null
+++ b/jbig2enc-pdfpy.patch
@@ -0,0 +1,12 @@
+diff --unified --recursive --text jbig2enc-0.29.orig/pdf.py jbig2enc-0.29.new/pdf.py
+--- jbig2enc-0.29.orig/pdf.py 2017-01-30 12:27:36.000000000 -0500
++++ jbig2enc-0.29.new/pdf.py 2022-02-18 17:35:32.486793465 -0500
+@@ -162,7 +162,7 @@
+ pages.d.d['Count'] = str(len(page_objs))
+ pages.d.d['Kids'] = '[' + ' '.join([ref(x.id) for x in page_objs]) + ']'
+
+- print str(doc)
++ print(str(doc))
+
+
+ def usage(script, msg):
diff --git a/jbig2enc.1 b/jbig2enc.1
new file mode 100644
index 000000000000..c7ccdff75cad
--- /dev/null
+++ b/jbig2enc.1
@@ -0,0 +1,201 @@
+.\" Automatically generated by Pandoc 2.17.1.1
+.\"
+.\" Define V font for inline verbatim, using C font in formats
+.\" that render this, and otherwise B font.
+.ie "\f[CB]x\f[]"x" \{\
+. ftr V B
+. ftr VI BI
+. ftr VB B
+. ftr VBI BI
+.\}
+.el \{\
+. ftr V CR
+. ftr VI CI
+. ftr VB CB
+. ftr VBI CBI
+.\}
+.TH "JBIG2ENC" "1" "" "" ""
+.hy
+.SH jbig2enc: Documentation
+.PP
+Adam Langley <agl\[at]imperialviolet.org>
+.SS What is JBIG2
+.PP
+JBIG2 is an image compression standard from the same people who brought
+you the JPEG format.
+It compresses 1bpp (black and white) images only.
+These images can consist of \f[I]only\f[R] black and white, there are no
+shades of gray - that would be a grayscale image.
+Any \[dq]gray\[dq] areas must, therefore, be simulated using black dots
+in a pattern called halftoning (http://en.wikipedia.org/wiki/Halftone).
+.PP
+The JBIG2 standard has several major areas:
+.IP \[bu] 2
+Generic region coding
+.IP \[bu] 2
+Symbol encoding (and text regions)
+.IP \[bu] 2
+Refinement
+.IP \[bu] 2
+Halftoning
+.PP
+There are two major compression technologies which JBIG2 builds on:
+arithmetic encoding (http://en.wikipedia.org/wiki/Arithmetic_coding) and
+Huffman encoding (http://en.wikipedia.org/wiki/Huffman_coding).
+You can choose between them and use both in the same JBIG2 file, but
+this is rare.
+Arithmetic encoding is slower, but compresses better.
+Huffman encoding was included in the standard because one of the
+(intended) users of JBIG2 were fax machines and they might not have the
+processing power for arithmetic coding.
+.PP
+jbig2enc \f[I]only\f[R] supports arithmetic encoding
+.SS Generic region coding
+.PP
+Generic region coding is used to compress bitmaps.
+It is progressive and uses a context around the current pixel to be
+decoded to estimate the probability that the pixel will be black.
+If the probability is 50% it uses a single bit to encode that pixel.
+If the probability is 99% then it takes less than a bit to encode a
+black pixel, but more than a bit to encode a white one.
+.PP
+The context can only refer to pixels above and to the left of the
+current pixel, because the decoder doesn\[aq]t know the values of any of
+the other pixels yet (pixels are decoded left-to-right, top-to-bottom).
+Based on the values of these pixels it estimates a probability and
+updates its estimation for that context based on the actual pixel
+found.
+All contexts start off with a 50% chance of being black.
+.PP
+You can encode whole pages with this and you will end up with a perfect
+reconstruction of the page.
+However, we can do better...
+.SS Symbol encoding
+.PP
+Most input images to JBIG2 encoders are scanned text.
+These have many repeating symbols (letters).
+The idea of symbol encoding is to encode what a letter \[lq]a\[rq] looks
+like and, for all the \[lq]a\[rq]s on the page, just give their
+locations.
+(This is lossy encoding)
+.PP
+Unfortunately, all scanned images have noise in them: no two
+\[lq]a\[rq]s will look quite the same so we have to group all the
+symbols on a page into groups.
+Hopefully each member of a given group will be the same letter,
+otherwise we might place the wrong letter on the page!
+These, very surprising, errors are called cootoots.
+.PP
+However, assuming that we group the symbols correctly, we can get great
+compression this way.
+Remember that the stricter the classifier, the more symbol groups
+(classes) will be generated, leading to bigger files.
+But, also, there is a lower risk of cootoots (misclassification).
+.PP
+This is great, but we can do better...
+.SS Symbol retention
+.PP
+Symbol retention is the process of compressing multi-page documents by
+extracting the symbols from all the pages at once and classifying them
+all together.
+Thus we only have to encode a single letter \[lq]a\[rq] for the whole
+document (in an ideal world).
+.PP
+This is obviously slower, but generates smaller files (about half the
+size on average, with a decent number of similar typeset pages).
+.PP
+One downside you should be aware of: If you are generating JBIG2 streams
+for inclusion to a linearised PDF file, the PDF reader has to download
+all the symbols before it can display the first page.
+There is a solution to this involving multiple dictionaries and symbol
+importing, but that\[aq]s not currently supported by jbig2enc.
+.SS Refinement
+.PP
+Symbol encoding is lossy because of noise, which is classified away and
+also because the symbol classifier is imperfect.
+Refinement allows us, when placing a symbol on the page, to encode the
+difference between the actual symbol at that location, and what the
+classifier told us was \[lq]close enough\[rq].
+We can choose to do this for each symbol on the page, so we don\[aq]t
+have to refine when we are only a couple of pixels off.
+If we refine whenever we have a wrong pixel, we have lossless encoding using
+symbols.
+.SS Halftoning
+.PP
+jbig2enc doesn\[aq]t support this at all - so I will only mention this
+quickly.
+The JBIG2 standard supports the efficient encoding of halftoning by
+building a dictionary of halftone blocks (like the dictionaries of
+symbols which we build for text pages).
+The lack of support for halftones in G4 (the old fax standard) was a
+major weakness.
+.SS Some numbers
+.PP
+My sample is a set of 90 scanned pages from the middle of a
+recent book.
+The scanned images are 300dpi grayscale and they are being upsampled to
+600dpi 1-bpp for encoding.
+.IP \[bu] 2
+Generic encoding each page: 3435177 bytes
+.IP \[bu] 2
+Symbol encoding each page (default classifier settings): 1075185 bytes
+.IP \[bu] 2
+Symbol encoding with refinement for more than 10 incorrect pixels:
+3382605 bytes
+.SS Command line options
+.PP
+jbig2enc comes with a handy command line tool for encoding images.
+.TP
+\f[B]-d\f[R] | \f[B]--duplicate-line-removal\f[R]
+When encoding generic regions each scan line can be tagged to indicate
+that it\[aq]s the same as the last scanline - and encoding that scanline
+is skipped.
+This drastically reduces the encoding time (by a factor of about 2 on
+some images) although it doesn\[aq]t typically save any bytes.
+This is an option because some versions of jbig2dec (an open source
+decoding library) cannot handle this.
+.TP
+\f[B]-p\f[R] | \f[B]--pdf\f[R]
+The PDF spec includes support for JBIG2
+(Syntax\[->]Filters\[->]JBIG2Decode in the PDF references for versions
+1.4 and above).
+However, PDF requires a slightly different format for JBIG2 streams: no
+file/page headers or trailers and all pages are numbered 1.
+In symbol mode the output is to a series of files: symboltable and
+page-\f[I]n\f[R] (numbered from 0)
+.TP
+\f[B]-s\f[R] | \f[B]--symbol-mode\f[R]
+use symbol encoding.
+Turn on for scanned text pages.
+.TP
+\f[B]-t\f[R] <threshold>
+sets the fraction of pixels which have to match in order for two symbols
+to be classed the same.
+This isn\[aq]t strictly true, as there are other tests as well, but
+increasing this will generally increase the number of symbol classes.
+.TP
+\f[B]-T\f[R] <threshold>
+sets the black threshold (0-255).
+Any gray value darker than this is considered black.
+Anything lighter is considered white.
+.TP
+\f[B]-r\f[R] | \f[B]--refine\f[R] <tolerance>
+(requires \f[B]-s\f[R]) turn on refinement for symbols with more than
+tolerance incorrect pixels.
+(10 is a good value for 300dpi, try 40 for 600dpi).
+Note: this is known to crash Adobe products.
+.TP
+\f[B]-O\f[R] <outfile>
+dump a PNG of the 1 bpp image before encoding.
+Can be used to test loss.
+.TP
+\f[B]-2\f[R] or \f[B]-4\f[R]
+upscale either two or four times before converting to black and white.
+.TP
+\f[B]-S\f[R]
+Segment an image into text and non-text regions.
+This isn\[aq]t perfect, but running text through the symbol compressor
+is terrible so it\[aq]s worth doing if your input has images in it (like
+a magazine page).
+You can also give the \f[B]--image-output\f[R] option to set a filename
+to which the parts which were removed are written (PNG format).