summarylogtreecommitdiffstats
path: root/PKGBUILD
blob: 16877802cd3e8487505d600e45845f800a7e548e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# Maintainer: Daniel Bershatsky <bepshatsky@yandex.ru>
# Contributor: Filip Graliński <filipg@amu.edu.pl>

# Cargo/rustup defaults. Each one is assigned only when the variable is unset
# or empty, so a value already provided by the caller is kept. They are plain
# shell variables here; _rust_env() is what exports them.
: ${CARGO_HOME:=$SRCDEST/cargo-home}
: ${CARGO_TARGET_DIR:=target}
: ${RUSTUP_TOOLCHAIN:=stable}

# Package identity: pkgname expands to "python-tokenizers".
_module="tokenizers"
_pkgname="python-$_module"
pkgname="$_pkgname"
pkgver=0.21.0
pkgrel=1
pkgdesc='Fast State-of-the-Art Tokenizers optimized for Research and Production'
url="https://github.com/huggingface/tokenizers"
license=('Apache-2.0')
arch=('i686' 'x86_64')

# Runtime dependencies of the installed package.
depends=('python')
# Tools required by prepare()/build()/package() (cargo toolchain bits and the
# Python wheel build/install front-ends).
makedepends=(
  'clang'
  'rust-bindgen'
  'python-build'
  'python-installer'
  'python-maturin'
  'python-setuptools-rust'
  'python-wheel'
)
# Extra packages for check(); entries marked "AUR" are, per the original
# annotations, not available from the official repositories.
checkdepends=(
  'python-datasets' # AUR
  #└─ 'python-huggingface-hub' # AUR
  #└─ 'python-multiprocess' # AUR
  'python-numpy'
  'python-pyarrow'
  'python-pytest'
  'python-requests'
  'python-setuptools-rust'
)

# NOTE(review): '!lto' presumably switches off makepkg's LTO option for this
# package; the reason is not recorded here.
options=('!lto')

# _pkgsrc is the directory name the release tarball is expected to unpack to;
# every build function cd's into "$_pkgsrc/bindings/python".
_pkgsrc="$_module-$pkgver"
_pkgext="tar.gz"
# Entry 1: upstream release tarball for tag v$pkgver, saved under a
#          versioned local name.
# Entries 2-3: test fixtures that check() copies into data/ as big.txt and
#          roberta.json.
source=(
  "$_pkgsrc.$_pkgext"::"$url/archive/refs/tags/v$pkgver.$_pkgext"
  "norvig-big.txt"::"https://norvig.com/big.txt"
  "roberta.json"::"https://huggingface.co/roberta-large/raw/main/tokenizer.json"
)
# One SHA-256 per source entry, listed in the same order as source=().
sha256sums=(
  '841279ad797d575ed3cf31fc4f30e09e37acbd35028d30c51fc0879ef7ed4094'
  'fa066c7d40f0f201ac4144e652aa62430e58a6b3805ec70650f678da5804e87b'
  '847bbeab6174d66a88898f729d52fa8d355fafe1bea101cf960dd404581df70e'
)

# Export the Rust-related settings into the environment of the calling shell
# so that child processes (cargo, python -m build, ...) see them.
# Also exports GIT_DIR as '.', overriding whatever git directory would
# otherwise be discovered.
_rust_env() {
  export GIT_DIR='.'
  export CARGO_HOME
  export CARGO_TARGET_DIR
  export RUSTUP_TOOLCHAIN
}

# Patch the Python bindings' manifest and pre-download all crates.
# The body is a subshell, so the exports from _rust_env and the cd below do
# not affect the caller.
prepare() (
  _rust_env
  cd "$_pkgsrc/bindings/python"

  # Typo fix in the manifest: rewrite "defaut" as "default" (first match on
  # each line), editing Cargo.toml in place.
  sed -i -E 's@defaut@default@' Cargo.toml

  # Bring the lockfile up to date first ...
  cargo update

  # ... then fetch exactly what the lockfile names, restricted to the host
  # triple that rustc reports on its "host: " line.
  local host_triple
  host_triple="$(rustc -vV | sed -n 's/host: //p')"
  cargo fetch --locked --target "$host_triple"
)

# Compile the Rust crate and then build the Python wheel.
# Runs in a subshell so the exported Rust environment stays local.
build() (
  _rust_env
  cd "$_pkgsrc/bindings/python"

  # Release build; --frozen makes cargo use the existing lockfile and the
  # crates already downloaded, without touching the network.
  cargo build --release --frozen

  # Build only a wheel, using the build tools installed on the system
  # rather than an isolated environment. package() picks it up from dist/.
  python -m build --wheel --no-isolation
)

# Run the Python and Rust test suites against the freshly built wheel.
#
# Both suites are best-effort: a failing run is reported in the log but does
# not abort the package build (the trailing "|| :").
#
# The body is a subshell, like prepare() and build(), so the environment
# exported by _rust_env and the cd do not leak into the caller.
check() (
  # Export CARGO_HOME/CARGO_TARGET_DIR/RUSTUP_TOOLCHAIN for `cargo test`.
  # prepare() and build() export them only inside their own subshells, so
  # without this call cargo would not see the CARGO_HOME that prepare()
  # populated.
  _rust_env
  cd "$_pkgsrc/bindings/python"

  # Test fixtures: the two extra downloads from source=() under the names
  # used in data/, plus a 100-line excerpt of the big corpus.
  install -Dm644 "$srcdir/roberta.json" "data/roberta.json"
  install -Dm644 "$srcdir/norvig-big.txt" "data/big.txt"
  head -n 100 data/big.txt > data/small.txt

  # Install the wheel into a throw-away venv that can still see the system
  # site-packages (pytest, installer, ...). --clear empties a venv left over
  # from a previous run so that re-installing the wheel does not fail on
  # files that already exist.
  python -m venv --clear --system-site-packages test-env
  test-env/bin/python -m installer dist/*.whl
  test-env/bin/python -m pytest -s -v tests || :

  # Rust-side tests, with the crate's default features disabled.
  cargo test --no-default-features || :
)

# Install the wheel(s) produced by build() into the packaging root.
package() {
  cd "$_pkgsrc/bindings/python"

  # Expand the glob once into an array, then hand every match to installer.
  local wheels=(dist/*.whl)
  python -m installer --destdir="$pkgdir" "${wheels[@]}"
}