Added data scripts, gitignore

author: Eddy Pedroni <eddy@0xf7.com> 2022-10-17 22:46:54 +0200
committer: Eddy Pedroni <eddy@0xf7.com> 2022-10-17 22:46:54 +0200
commit: 31639b35e17732cf4c543194ec6d830da0178540 (patch)
tree: 638a2680aa9af58ac8def7f43f4066fb9269627c /data
parent: 92761bbfc8459e8cba94b91a3969524bf7bd26c4 (diff)
3 files changed, 73 insertions, 0 deletions
diff --git a/data/adj-query.sh b/data/adj-query.sh
new file mode 100755
index 0000000..986c435
--- /dev/null
+++ b/data/adj-query.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/zsh
+
+dump_date="20221001"
+dump_file="dewiktionary-${dump_date}-pages-articles.xml"
+dump_url="https://dumps.wikimedia.org/dewiktionary/${dump_date}/${dump_file}.bz2"
+
+./grab-dump.sh "${dump_url}"
+
+grep -A 10 -E -e "Deutsch Adjektiv Übersicht" "${dump_file}" > adj-stage1
+grep -E -e "^\|Positiv=[a-z]" adj-stage1 > adj-stage2
+sed -r "s/\|Positiv=(.*)/\1/g" adj-stage2 > adj-stage3
+
+python -c """
+import json
+
+with open('adj-stage3', 'r', encoding='utf8') as clean:
+    with open('adjectives.json', 'w', encoding='utf-8') as ded:
+        l = [a.strip() for a in set(clean.readlines())]
+        l.sort()
+        json.dump(l, ded, ensure_ascii=False)
+"""
+
+rm adj-stage*
diff --git a/data/grab-dump.sh b/data/grab-dump.sh
new file mode 100755
index 0000000..9a740cd
--- /dev/null
+++ b/data/grab-dump.sh
@@ -0,0 +1,14 @@
+#!/usr/bin/zsh
+
+dump_url="$1"
+dump_file=$(basename "${dump_url%.bz2}")
+
+if [[ ! -f "${dump_file}.bz2" && ! -f "${dump_file}" ]];
+then
+    wget "${dump_url}"
+fi
+
+if [ ! -f "${dump_file}" ];
+then
+    bunzip2 -d "${dump_file}.bz2"
+fi
diff --git a/data/noun-query.sh b/data/noun-query.sh
new file mode 100755
index 0000000..948bf02
--- /dev/null
+++ b/data/noun-query.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/zsh
+
+dump_date="20221001"
+dump_file="dewiktionary-${dump_date}-pages-articles.xml"
+dump_url="https://dumps.wikimedia.org/dewiktionary/${dump_date}/${dump_file}.bz2"
+
+./grab-dump.sh "${dump_url}"
+
+grep -A 14 -E -e "Deutsch Substantiv Übersicht" "${dump_file}" > noun-stage1
+sed -r -z "s/\n}}/}}/g" noun-stage1 > noun-stage2
+sed -r -z "s/\n(\|[A-Z])/\1/g" noun-stage2 > noun-stage3
+grep -e "Deutsch Substantiv Übersicht" noun-stage3 > noun-stage4
+
+python -c """
+with open('noun-stage4', 'r', encoding='utf8') as clean:
+    cleanLines = clean.readlines()
+
+# list[list[str]]
+sp = [l.strip().rstrip('}}').lstrip('{{Deutsch Substantiv Übersicht|').split('|') for l in cleanLines]
+
+# list[dict[str:str]]
+dicts = [{i.split('=')[0] : i.split('=')[1] for i in entry if len(i.split('=')) > 1} for entry in sp]
+
+with open('nouns.csv', 'w', encoding='utf-8') as nouns:
+    nouns.write('gender,nom-sin,nom-plu,akk-sin,akk-plu,dat-sin,dat-plu,gen-sin,gen-plu\n')
+
+    for d in dicts:
+        try:
+            line = ','.join([d['Genus'], d['Nominativ Singular'], d['Nominativ Plural'], d['Akkusativ Singular'], d['Akkusativ Plural'], d['Dativ Singular'], d['Dativ Plural'], d['Genitiv Singular'], d['Genitiv Plural']])
+            nouns.write(line + '\n')
+        except:
+            pass
+"""
+
+
+rm noun-stage*
author	Eddy Pedroni <eddy@0xf7.com>	2022-10-17 22:46:54 +0200
committer	Eddy Pedroni <eddy@0xf7.com>	2022-10-17 22:46:54 +0200
commit	31639b35e17732cf4c543194ec6d830da0178540 (patch)
tree	638a2680aa9af58ac8def7f43f4066fb9269627c /data
parent	92761bbfc8459e8cba94b91a3969524bf7bd26c4 (diff)