data/noun-query.sh


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46

#!/usr/bin/zsh

# Which German Wiktionary dump to process. The file name and the download URL
# are both derived from the dump date.
dump_date="20221001"
dump_file="dewiktionary-${dump_date}-pages-articles.xml"
dump_url="https://dumps.wikimedia.org/dewiktionary/${dump_date}/${dump_file}.bz2"

# NOTE(review): grab-dump.sh is not visible here; presumably it downloads the
# .bz2 archive and unpacks it to ${dump_file} in the current directory, since
# that is the path the extraction stage reads. Confirm against the helper.
./grab-dump.sh "${dump_url}"

# Stop here if the dump is not available. Without this check the later stages
# run on nothing and overwrite nouns.csv with a header-only file.
if [ ! -r "${dump_file}" ]; then
  printf 'error: dump file %s is missing or unreadable\n' "${dump_file}" >&2
  exit 1
fi

# Collapse every noun-overview template found in the dump onto a single line,
# leaving the result in noun-stage4 (noun-stage1..3 are intermediate files).
noun_template="Deutsch Substantiv Übersicht"

# Stage 1: every line naming the template, plus the 14 lines that follow it.
grep -F -A14 -e "${noun_template}" "${dump_file}" >noun-stage1

# Stage 2: -z makes sed treat the file as one record so newlines can be
# matched; drop any newline that is immediately followed by "}}".
sed -E -z -e 's/\n}}/}}/g' noun-stage1 >noun-stage2

# Stage 3: drop any newline that is immediately followed by "|" and an
# uppercase letter, joining those parameter lines onto the preceding line.
sed -E -z -e 's/\n(\|[A-Z])/\1/g' noun-stage2 >noun-stage3

# Stage 4: keep only the lines that mention the template; this also discards
# the other context lines and separators that stage 1 pulled in.
grep -F -e "${noun_template}" noun-stage3 >noun-stage4

#######################################
# Build nouns.csv from the flattened template lines.
# Reads:   noun-stage4  - one noun-overview template per line
#          common-nouns - one noun per line; only these nouns are exported
# Writes:  nouns.csv    - header row plus one row per exported noun
# Returns: the exit status of the Python interpreter
#######################################
build_nouns_csv() {
  # The quoted delimiter passes the program to Python verbatim: the shell
  # performs no expansion or backslash processing on it.
  python - <<'PY'
import re

TEMPLATE_OPEN = '{{Deutsch Substantiv Übersicht'
TEMPLATE_CLOSE = '}}'
CSV_HEADER = 'gender,nom-sin,nom-plu,akk-sin,akk-plu,dat-sin,dat-plu,gen-sin,gen-plu'
# Template parameters, in the same order as the CSV columns above.
FIELDS = (
    'Genus',
    'Nominativ Singular', 'Nominativ Plural',
    'Akkusativ Singular', 'Akkusativ Plural',
    'Dativ Singular', 'Dativ Plural',
    'Genitiv Singular', 'Genitiv Plural',
)


def parse_template(line):
    """Return the key=value parameters of one flattened template line as a dict.

    Pieces without '=' (such as the template name itself) are ignored. If a
    key occurs more than once, the last occurrence wins.
    """
    text = line.strip()
    # str.lstrip()/str.rstrip() remove a *set of characters*, not a fixed
    # prefix/suffix, so they can eat the start of the first key (for example
    # 'Dativ Singular' -> 'gular') or braces that belong to the last value.
    # Remove exactly one opening marker and one closing marker instead.
    if text.startswith(TEMPLATE_OPEN):
        text = text[len(TEMPLATE_OPEN):]
    if text.endswith(TEMPLATE_CLOSE):
        text = text[:-len(TEMPLATE_CLOSE)]

    params = {}
    for piece in text.split('|'):
        # Split on the first '=' only so values containing '=' stay intact.
        key, sep, value = piece.partition('=')
        if sep:
            params[key] = value
    return params


with open('noun-stage4', 'r', encoding='utf8') as stage:
    entries = [parse_template(line) for line in stage]

# A set gives constant-time membership tests for the per-entry filter below.
with open('common-nouns', 'r', encoding='utf-8') as common:
    common_nouns = {noun.strip().lower() for noun in common}

with open('nouns.csv', 'w', encoding='utf-8') as out:
    out.write(CSV_HEADER + '\n')

    for entry in entries:
        try:
            row = [entry[field] for field in FIELDS]
        except KeyError:
            # Incomplete declension table: skip the entry. Only this expected
            # case is suppressed; I/O errors are allowed to surface.
            continue

        nom_sin = row[1]

        # Keep only entries whose nominative singular starts with an ASCII
        # letter or underscore (an empty value never matches).
        # NOTE(review): this also rejects nouns starting with Ä/Ö/Ü; confirm
        # that is intended.
        if not re.match(r'[A-Za-z_]', nom_sin):
            continue

        if nom_sin.strip().lower() not in common_nouns:
            continue

        # NOTE(review): values are joined without CSV quoting, so a value
        # containing ',' would shift the columns of its row.
        out.write(','.join(row) + '\n')
PY
}

build_nouns_csv

# Remove the intermediate stage files. They are named explicitly rather than
# with a "noun-stage*" glob so that unrelated files in this directory whose
# names happen to share the prefix are never deleted.
rm -- noun-stage1 noun-stage2 noun-stage3 noun-stage4