summaryrefslogtreecommitdiffstats
path: root/data/adj-query.sh
blob: 986c43549c7e4be0efe6cccbf99fba4488a9de4c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
#!/usr/bin/zsh
#
# Build adjectives.json: a sorted, de-duplicated JSON array of the values
# found in the "Positiv=" field of every "Deutsch Adjektiv Übersicht"
# template in a German Wiktionary pages-articles dump.
#
# Usage: adj-query.sh [DUMP_DATE]
#   DUMP_DATE  dump snapshot in YYYYMMDD form (default: 20221001)
#
# Needs ./grab-dump.sh next to the current directory and python3 on PATH.
# Writes ./adjectives.json in the current directory.

# Abort on the first failing command or unset variable instead of silently
# producing an empty adjectives.json.
set -eu

dump_date="${1:-20221001}"
dump_file="dewiktionary-${dump_date}-pages-articles.xml"
dump_url="https://dumps.wikimedia.org/dewiktionary/${dump_date}/${dump_file}.bz2"

# The extraction below reads ./${dump_file}; grab-dump.sh is presumably what
# downloads and unpacks it from the given URL (verify against that script).
./grab-dump.sh "${dump_url}"

if [ ! -r "${dump_file}" ]; then
  printf 'error: dump file not readable: %s\n' "${dump_file}" >&2
  exit 1
fi

# Keep intermediate files in a private directory that is removed on every
# exit path, rather than as predictable adj-stage* names in the cwd.
work_dir="$(mktemp -d)"
trap 'rm -rf -- "${work_dir}"' EXIT

stage1="${work_dir}/adj-stage1"
stage2="${work_dir}/adj-stage2"
stage3="${work_dir}/adj-stage3"

# grep exits 1 when nothing matched; that is tolerated (the result is then an
# empty list, as before). Any other non-zero status is a real error.

# Stage 1: each template header line plus the 10 lines following it.
grep -A 10 -E -e "Deutsch Adjektiv Übersicht" "${dump_file}" > "${stage1}" \
  || [ "$?" -eq 1 ]

# Stage 2: only the "|Positiv=" lines whose value starts with a-z.
# NOTE(review): [a-z] may skip values beginning with an umlaut (locale
# dependent) — confirm whether that is intended.
grep -E -e "^\|Positiv=[a-z]" "${stage1}" > "${stage2}" \
  || [ "$?" -eq 1 ]

# Stage 3: drop the "|Positiv=" prefix, leaving the bare word.
sed -E "s/^\|Positiv=//" "${stage2}" > "${stage3}"

# Strip whitespace BEFORE de-duplicating so that lines differing only in
# trailing whitespace collapse into one entry; read the input fully before
# opening the output so a read failure cannot truncate adjectives.json.
python3 - "${stage3}" <<'PY'
import json
import sys

with open(sys.argv[1], 'r', encoding='utf-8') as stage:
    adjectives = sorted({line.strip() for line in stage})

with open('adjectives.json', 'w', encoding='utf-8') as out:
    json.dump(adjectives, out, ensure_ascii=False)
PY