# path: root/PKGBUILD
# blob: 79d87b9236d8c7c645e36e1ff9fb978f618ac424 (plain)
# Maintainer: Henry-ZHR <henry-zhr@qq.com>
# Upstream project name; reused for the git checkout directory and pkgbase.
_name=sentencepiece
pkgbase=${_name}
# Split package: the native library plus its Python wrapper.
pkgname=(
  "${pkgbase}"
  "python-${pkgbase}"
)
pkgver=0.2.0
pkgrel=4
pkgdesc='Unsupervised text tokenizer for Neural Network-based text generation'
arch=('x86_64')
url='https://github.com/google/sentencepiece'
license=('Apache-2.0')
makedepends=(
  'git'
  'cmake'
  'abseil-cpp'
  'gperftools'
  'protobuf'
  'python'
  'python-build'
  'python-setuptools'
  'python-wheel'
  'python-installer'
)
checkdepends=('python-pytest')
# Pinned revision of the release; obtain with: git rev-parse "v${pkgver}"
_tag=17d7580d6407802f85855d2cc9190634e2c95624
# Entry 0 is the git checkout; the remaining entries are upstream commits
# downloaded as patches and applied in prepare().
source=(
  "${_name}::git+${url}.git#tag=${_tag}"
  "fix-crash-in-unigram-model-training.patch::${url}/commit/d19ac45c919602cb041a86599d0593d24a150ac2.patch"
  "bump-cmake-minimum-required-version.patch::${url}/commit/e2127b9b932ba00811d5023c5ea69a12a857b244.patch"
)
# Same order as source=(); the git checkout is not checksummed.
sha512sums=(
  'SKIP'
  '644bc47fb3b90f2447ae9aac5ff2939fa6c9b3b0dc33550828b8517656f33fb1b41b2ebf9443e4b39a64bb963533c8d7a323b100d0b37671b070b7368f6fb1c7'
  'a4749510e7a4e5c72c60e67e903201d5f6b2224752059481613cb6e0e01c901d0bdbd83553ecc0b916f551e6f37342bab6bf298dfcdd5234129b1645299775b9'
)

# Print the package version derived from the checked-out sources: the output
# of `git describe --tags` with a leading "v" removed.
# Globals:  _name (read) - directory of the git checkout
# Outputs:  the version string on stdout
# Returns:  non-zero if `git describe` fails
pkgver() {
  local described
  # Capture first instead of piping: in a pipeline without pipefail the exit
  # status of git would be replaced by that of the downstream filter, and a
  # failed describe would look like a successful, empty version.
  described=$(git -C "${_name}" describe --tags) || return
  printf '%s\n' "${described#v}"
}

# Bring the checkout into a pristine state, apply the downloaded upstream
# patches and point python/setup.py at the shared libraries.
# Globals:  _name (read) - directory of the git checkout
prepare() {
  cd "${_name}"

  # Remove untracked and ignored files left over from an earlier run.
  git clean -dfx

  # Patches live one level up, next to the checkout. Applied in this order:
  #   fix-crash-in-unigram-model-training:
  #     see https://github.com/google/sentencepiece/pull/1088
  #     (should fix test for v0.2.0)
  #   bump-cmake-minimum-required-version:
  #     fix build for CMake 4.0.0+
  local patch_name
  for patch_name in \
    fix-crash-in-unigram-model-training \
    bump-cmake-minimum-required-version; do
    git apply --verbose "../${patch_name}.patch"
  done

  # Use shared libs for python module
  sed -i \
    -e 's/libsentencepiece.a/libsentencepiece.so/g' \
    -e 's/libsentencepiece_train.a/libsentencepiece_train.so/g' \
    python/setup.py
}

# Configure and compile the project with CMake, stage a private install under
# build/root, then build the Python wheel from the python/ subdirectory.
# Globals:  _name (read) - directory of the git checkout
build() {
  cd "${_name}"

  cmake -S . -B build \
    -DCMAKE_INSTALL_PREFIX=/usr \
    -DSPM_BUILD_TEST=ON \
    -DSPM_ENABLE_TCMALLOC=ON \
    -DSPM_ENABLE_SHARED=ON \
    -DSPM_PROTOBUF_PROVIDER=package \
    -DSPM_ABSL_PROVIDER=package \
    -Wno-dev
  cmake --build build --parallel "$(nproc)"

  # Staging tree with prefix "/" so files land in build/root/{lib,include,...};
  # check() puts build/root/lib on LD_LIBRARY_PATH.
  # NOTE(review): presumably python/setup.py also looks here for the headers
  # and libraries - confirm against upstream setup.py.
  # -p keeps a re-run of build() working when the directory already exists
  # (prepare(), which cleans the tree, is not always run before build()).
  mkdir -p build/root
  # Absolute DESTDIR so the result does not depend on the working directory
  # the install step happens to use.
  DESTDIR="${PWD}/build/root" cmake --install build --prefix /

  cd python
  python -m build --wheel --no-isolation
}

# Run the native test suite through ctest, then the Python tests through
# pytest against the freshly built extension module.
# Globals:  _name, srcdir, CARCH (read)
# Returns:  non-zero if ctest, the Python version probe or pytest fails
check() {
  cd "${_name}"

  ctest --test-dir build --output-on-failure

  # Subshell: the directory change and the exported variables stay local to
  # the Python test run.
  (
    cd python

    # Declare and assign separately: `local v=$(cmd)` would return the status
    # of `local` and hide a failing interpreter, letting pytest run with a
    # malformed PYTHONPATH.
    local python_version
    # Major and minor version concatenated without a separator, as used in
    # the build directory name below. `exit` only leaves this subshell.
    python_version=$(python -c 'import sys; print("".join(map(str, sys.version_info[:2])))') || exit

    # Directory under python/build/ holding the compiled module.
    export PYTHONPATH="${PWD}/build/lib.linux-${CARCH}-cpython-${python_version}"
    # Let the module find the shared libraries staged by build(); append to
    # any existing LD_LIBRARY_PATH without leaving an empty component.
    export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}${srcdir}/${_name}/build/root/lib"
    pytest test/
  )
}

# Package the native part: install the CMake build tree into the package
# directory and declare the runtime dependencies and provided libraries.
# Globals:  _name, pkgdir (read); depends, provides (written)
package_sentencepiece() {
  depends=(
    'gcc-libs'
    'glibc'
    'abseil-cpp'
    'gperftools'
    'protobuf'
  )
  provides=(
    'libsentencepiece.so'
    'libsentencepiece_train.so'
  )

  # Build tree configured in build(), relative to the current directory.
  local build_tree="${_name}/build"
  DESTDIR="${pkgdir}" cmake --install "${build_tree}"
}

# Package the Python wrapper: install the wheel produced by build() into the
# package directory. Depends on the exact version and release of the base
# package built from this same PKGBUILD.
# Globals:  _name, pkgbase, pkgver, pkgrel, pkgdir (read);
#           pkgdesc, depends, optdepends (written)
package_python-sentencepiece() {
  pkgdesc="Python wrapper for SentencePiece"
  depends=(
    "${pkgbase}=${pkgver}-${pkgrel}"
    'gcc-libs'
    'glibc'
    'python'
  )
  optdepends=('python-protobuf')

  # The wheel(s) are in dist/ below the python/ subdirectory of the checkout.
  local wheel_root="${_name}/python"
  cd "${wheel_root}"
  python -m installer --destdir="${pkgdir}" dist/*.whl
}