summary refs log tree commit diff stats
path: root/data/adj-query.sh
diff options
context:
space:
mode:
Diffstat (limited to 'data/adj-query.sh')
-rwxr-xr-x data/adj-query.sh 23
1 files changed, 23 insertions, 0 deletions
diff --git a/data/adj-query.sh b/data/adj-query.sh
new file mode 100755
index 0000000..986c435
--- /dev/null
+++ b/data/adj-query.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/zsh
+# Extract the positive forms of German adjectives from a German Wiktionary
+# dump into adjectives.json as a sorted, de-duplicated JSON array.
+set -euo pipefail  # abort on a failed download or a broken pipeline stage
+
+dump_date="20221001"
+dump_file="dewiktionary-${dump_date}-pages-articles.xml"
+dump_url="https://dumps.wikimedia.org/dewiktionary/${dump_date}/${dump_file}.bz2"
+
+./grab-dump.sh "${dump_url}"  # presumably leaves ${dump_file} here -- confirm
+
+# Take the 10 lines after each adjective table header, keep the "|Positiv="
+# rows whose value starts with a-z, strip the key, then stream into Python.
+grep -A 10 -E -e "Deutsch Adjektiv Übersicht" "${dump_file}" \
+  | grep -E -e "^\|Positiv=[a-z]" \
+  | sed -E "s/\|Positiv=(.*)/\1/g" \
+  | python3 -c '
+import json
+# Strip before de-duplicating so whitespace variants collapse to one entry.
+words = sorted({line.strip() for line in open(0, encoding="utf8")})
+with open("adjectives.json", "w", encoding="utf-8") as out:
+    json.dump(words, out, ensure_ascii=False)
+'