From 31639b35e17732cf4c543194ec6d830da0178540 Mon Sep 17 00:00:00 2001
From: Eddy Pedroni
Date: Mon, 17 Oct 2022 22:46:54 +0200
Subject: Added data scripts, gitignore

---
 data/adj-query.sh  | 37 ++++++++++++++++++++++
 data/grab-dump.sh  | 32 +++++++++++++++++++
 data/noun-query.sh | 79 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 148 insertions(+)
 create mode 100755 data/adj-query.sh
 create mode 100755 data/grab-dump.sh
 create mode 100755 data/noun-query.sh

(limited to 'data')

diff --git a/data/adj-query.sh b/data/adj-query.sh
new file mode 100755
index 0000000..986c435
--- /dev/null
+++ b/data/adj-query.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/zsh
+#
+# Build adjectives.json: a sorted, de-duplicated JSON list of the positive
+# forms of the German adjectives found in a German Wiktionary dump.
+# Run from this directory so that ./grab-dump.sh resolves.
+
+dump_date="20221001"
+dump_file="dewiktionary-${dump_date}-pages-articles.xml"
+dump_url="https://dumps.wikimedia.org/dewiktionary/${dump_date}/${dump_file}.bz2"
+
+# Stop here if the dump is unavailable; otherwise the stages below would run
+# against a missing file and still write an empty adjectives.json.
+./grab-dump.sh "${dump_url}" || exit 1
+
+# Stage 1: every line mentioning the adjective overview template, plus the
+# 10 lines after it.
+grep -A 10 -E -e "Deutsch Adjektiv Übersicht" "${dump_file}" > adj-stage1
+# Stage 2: keep the |Positiv= lines whose value starts with a-z.
+# NOTE(review): the [a-z] range is locale-dependent and may drop adjectives
+# that start with an umlaut (e.g. ähnlich) - confirm this is intended.
+grep -E -e "^\|Positiv=[a-z]" adj-stage1 > adj-stage2
+# Stage 3: drop the |Positiv= prefix, leaving one adjective per line.
+sed -r "s/\|Positiv=(.*)/\1/g" adj-stage2 > adj-stage3
+
+python -c """
+import json
+
+with open('adj-stage3', 'r', encoding='utf8') as clean:
+    # Strip before de-duplicating so that entries differing only in
+    # surrounding whitespace collapse into one.
+    adjectives = sorted({line.strip() for line in clean})
+
+with open('adjectives.json', 'w', encoding='utf-8') as ded:
+    json.dump(adjectives, ded, ensure_ascii=False)
+"""
+
+rm adj-stage*
diff --git a/data/grab-dump.sh b/data/grab-dump.sh
new file mode 100755
index 0000000..9a740cd
--- /dev/null
+++ b/data/grab-dump.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/zsh
+#
+# Usage: grab-dump.sh <dump-url>
+# Download a .bz2 dump into the current directory and decompress it,
+# skipping whichever step has already been done.
+
+if [[ -z "$1" ]]
+then
+    echo "usage: $0 <dump-url>" >&2
+    exit 2
+fi
+
+dump_url="$1"
+# Name of the decompressed file: the URL basename without its .bz2 suffix.
+dump_file=$(basename "${dump_url%.bz2}")
+
+if [[ ! -f "${dump_file}.bz2" && ! -f "${dump_file}" ]]
+then
+    # Download under a temporary name so that an interrupted transfer is
+    # never mistaken for a complete archive on the next run.
+    if ! wget -O "${dump_file}.bz2.part" "${dump_url}"
+    then
+        rm -f "${dump_file}.bz2.part"
+        exit 1
+    fi
+    mv "${dump_file}.bz2.part" "${dump_file}.bz2"
+fi
+
+if [[ ! -f "${dump_file}" ]]
+then
+    bunzip2 -d "${dump_file}.bz2"
+fi
diff --git a/data/noun-query.sh b/data/noun-query.sh
new file mode 100755
index 0000000..948bf02
--- /dev/null
+++ b/data/noun-query.sh
@@ -0,0 +1,79 @@
+#!/usr/bin/zsh
+#
+# Build nouns.csv: one row per German noun template found in a German
+# Wiktionary dump, holding its gender and its singular/plural forms.
+# Run from this directory so that ./grab-dump.sh resolves.
+
+dump_date="20221001"
+dump_file="dewiktionary-${dump_date}-pages-articles.xml"
+dump_url="https://dumps.wikimedia.org/dewiktionary/${dump_date}/${dump_file}.bz2"
+
+# Stop here if the dump is unavailable instead of parsing a missing file.
+./grab-dump.sh "${dump_url}" || exit 1
+
+# Stage 1: every line mentioning the noun overview template, plus the 14
+# lines after it.
+grep -A 14 -E -e "Deutsch Substantiv Übersicht" "${dump_file}" > noun-stage1
+# Stages 2-3: fold the closing braces and every |Parameter line onto the
+# preceding line (-z lets the expressions match across newlines).
+sed -r -z "s/\n}}/}}/g" noun-stage1 > noun-stage2
+sed -r -z "s/\n(\|[A-Z])/\1/g" noun-stage2 > noun-stage3
+# Stage 4: keep only the folded template lines.
+grep -e "Deutsch Substantiv Übersicht" noun-stage3 > noun-stage4
+
+python -c """
+import csv
+import sys
+
+PREFIX = '{{Deutsch Substantiv Übersicht|'
+SUFFIX = '}}'
+# Template parameters written to nouns.csv, in column order.
+FIELDS = [
+    'Genus',
+    'Nominativ Singular', 'Nominativ Plural',
+    'Akkusativ Singular', 'Akkusativ Plural',
+    'Dativ Singular', 'Dativ Plural',
+    'Genitiv Singular', 'Genitiv Plural',
+]
+
+
+def parse(line):
+    # Turn one folded template line into a {parameter: value} dict.
+    body = line.strip()
+    # Remove the wrapper as whole strings; str.lstrip/rstrip take a set of
+    # characters and could also eat the start or end of a value.
+    if body.startswith(PREFIX):
+        body = body[len(PREFIX):]
+    if body.endswith(SUFFIX):
+        body = body[:-len(SUFFIX)]
+    entry = {}
+    for field in body.split('|'):
+        # partition keeps any further = signs inside the value.
+        key, sep, value = field.partition('=')
+        if sep:
+            entry[key] = value
+    return entry
+
+
+with open('noun-stage4', 'r', encoding='utf8') as clean:
+    entries = [parse(line) for line in clean]
+
+skipped = 0
+with open('nouns.csv', 'w', encoding='utf-8', newline='') as nouns:
+    # csv.writer quotes values that contain a comma; a plain join would
+    # shift the columns of such rows.
+    writer = csv.writer(nouns, lineterminator='\n')
+    writer.writerow(['gender', 'nom-sin', 'nom-plu', 'akk-sin', 'akk-plu',
+                     'dat-sin', 'dat-plu', 'gen-sin', 'gen-plu'])
+    for entry in entries:
+        try:
+            writer.writerow([entry[name] for name in FIELDS])
+        except KeyError:
+            # Templates lacking one of the nine forms are left out.
+            skipped += 1
+
+print(f'skipped {skipped} incomplete noun entries', file=sys.stderr)
+"""
+
+
+rm noun-stage*
-- 
cgit v1.2.3