diff options
Diffstat (limited to 'data/adj-query.sh')
-rwxr-xr-x | data/adj-query.sh | 23 |
1 files changed, 23 insertions, 0 deletions
#!/usr/bin/zsh
#
# data/adj-query.sh - build adjectives.json from a German Wiktionary dump.
#
# Usage: ./adj-query.sh
#   Run from the directory that contains grab-dump.sh; adjectives.json is
#   written to the current directory.
#
# Steps:
#   1. Pass the dump URL to ./grab-dump.sh.
#      NOTE(review): this script assumes grab-dump.sh leaves the decompressed
#      "${dump_file}" in the current directory - confirm against that helper.
#   2. Collect the "|Positiv=" lines found within 10 lines after each
#      "Deutsch Adjektiv Übersicht" match.
#   3. Strip, deduplicate and sort the words, then write them as a JSON array.

# Abort on the first failing command, unset variable or failing pipeline
# stage, so a failed download or an empty extraction never silently
# overwrites adjectives.json with an empty list.
set -euo pipefail

dump_date="20221001"
dump_file="dewiktionary-${dump_date}-pages-articles.xml"
dump_url="https://dumps.wikimedia.org/dewiktionary/${dump_date}/${dump_file}.bz2"

./grab-dump.sh "${dump_url}"

# Use a unique scratch file rather than fixed adj-stage* names in the working
# directory: nothing unrelated can be clobbered or removed by a glob, and the
# EXIT trap cleans up on failure as well as on success.
positives="$(mktemp)"
trap 'rm -f -- "${positives}"' EXIT

# NOTE(review): "[a-z]" keeps only entries whose first character falls in that
# range; whether ä/ö/ü are included depends on the locale's collation, and in
# the C locale they are skipped. Confirm whether that is intended.
grep -A 10 -E -e "Deutsch Adjektiv Übersicht" -- "${dump_file}" \
  | grep -E -e "^\|Positiv=[a-z]" \
  | sed -E "s/\|Positiv=(.*)/\1/g" \
  > "${positives}"

# The quoted heredoc delimiter keeps the shell from expanding anything inside
# the Python source. python3 is requested explicitly because the snippet
# relies on Python 3's open(..., encoding=...).
python3 - "${positives}" <<'PY'
import json
import sys

with open(sys.argv[1], 'r', encoding='utf-8') as clean:
    # Strip before deduplicating so entries that differ only in surrounding
    # whitespace (or a missing final newline) collapse into one.
    words = sorted({line.strip() for line in clean})

with open('adjectives.json', 'w', encoding='utf-8') as ded:
    json.dump(words, ded, ensure_ascii=False)
PY