blob: 450268db40437fd11a7b3f91a0c1c5e19a38d85a (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
|
#!/usr/bin/zsh
#
# Builds adjectives.json (in the current directory) from a German Wiktionary
# XML dump.
#
# Steps:
#   1. Hand the dump URL to ./grab-dump.sh.
#   2. Select every line containing "Deutsch Adjektiv Übersicht" plus the
#      10 lines following it.
#   3. From those, keep the lines that start with "|Positiv=" followed by a
#      lowercase letter, and strip the "|Positiv=" prefix.
#   4. De-duplicate, sort and write the result as a JSON array.
#
# Requires: grep (with -A support), sed, mktemp, python3, ./grab-dump.sh.

# Abort on the first failing command or use of an unset variable, so that a
# failed download can never end in an "empty but valid" adjectives.json.
set -eu

dump_date="latest"
dump_file="dewiktionary-${dump_date}-pages-articles.xml"
dump_url="https://dumps.wikimedia.org/dewiktionary/${dump_date}/${dump_file}.bz2"

./grab-dump.sh "${dump_url}"

# NOTE(review): this assumes grab-dump.sh leaves the decompressed dump at
# ./${dump_file} -- confirm against that script.
if [ ! -r "${dump_file}" ]; then
  printf 'error: dump file not found or unreadable: %s\n' "${dump_file}" >&2
  exit 1
fi

# Intermediate files live in a private temporary directory that is removed on
# every exit path, instead of predictable adj-stage* names in the current
# directory (which a glob-based rm could confuse with unrelated files).
work_dir="$(mktemp -d)"
trap 'rm -rf -- "${work_dir}"' EXIT

stage1="${work_dir}/adj-stage1"
stage2="${work_dir}/adj-stage2"
stage3="${work_dir}/adj-stage3"

# grep exits with status 1 when nothing matched. That is a valid (empty)
# result here, so only statuses other than 1 (real errors) abort the script.
grep -A 10 -E -e "Deutsch Adjektiv Übersicht" -- "${dump_file}" > "${stage1}" \
  || [ "$?" -eq 1 ]
grep -E -e "^\|Positiv=[a-z]" -- "${stage1}" > "${stage2}" \
  || [ "$?" -eq 1 ]
sed -E "s/\|Positiv=(.*)/\1/g" -- "${stage2}" > "${stage3}"

# The quoted heredoc delimiter keeps the shell from expanding anything inside
# the Python source; the input path is passed as an argument instead.
python3 - "${stage3}" <<'PYTHON'
import json
import sys

with open(sys.argv[1], 'r', encoding='utf8') as clean:
    # Strip before de-duplicating so that entries differing only in
    # surrounding whitespace (e.g. a final line without a newline) collapse
    # into a single entry.
    adjectives = sorted({line.strip() for line in clean})

with open('adjectives.json', 'w', encoding='utf-8') as ded:
    json.dump(adjectives, ded, ensure_ascii=False)
PYTHON
|