#!/usr/bin/zsh
#
# Builds adjectives.json: a sorted, de-duplicated JSON array of the values
# found on "|Positiv=" lines that appear within 10 lines after a line
# containing "Deutsch Adjektiv Übersicht" in the German Wiktionary
# pages-articles dump.
#
# Requires ./grab-dump.sh in the current directory, GNU-compatible grep/sed,
# and python3. Writes adjectives.json into the current directory.

# Abort on the first failing command, on unset variables, and on a failure in
# any pipeline stage, so a broken download never yields a truncated result.
set -euo pipefail

dump_date="latest"
dump_file="dewiktionary-${dump_date}-pages-articles.xml"
dump_url="https://dumps.wikimedia.org/dewiktionary/${dump_date}/${dump_file}.bz2"
output_file="adjectives.json"

# Print a message to stderr and exit with a non-zero status.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Intermediate data lives in a private directory that is removed on every
# exit path, instead of predictable adj-stage* files in the working directory.
work_dir=$(mktemp -d) || die "mktemp failed"
trap 'rm -rf -- "${work_dir}"' EXIT

# NOTE(review): grab-dump.sh is presumably responsible for downloading and
# decompressing the dump so that ${dump_file} exists here afterwards - confirm.
./grab-dump.sh "${dump_url}" || die "grab-dump.sh failed for ${dump_url}"
[[ -r "${dump_file}" ]] || die "dump file not found or unreadable: ${dump_file}"

# 1. Keep each template header line plus the 10 lines following it.
# 2. Keep only "|Positiv=" lines whose value starts with a lowercase a-z letter.
# 3. Strip the "|Positiv=" prefix, leaving the bare value.
grep -A 10 -E -e "Deutsch Adjektiv Übersicht" -- "${dump_file}" \
  | grep -E -e "^\|Positiv=[a-z]" \
  | sed -E "s/\|Positiv=(.*)/\1/g" \
  > "${work_dir}/positives" \
  || die "no adjective entries could be extracted from ${dump_file}"

# The open(..., encoding=...) keyword only exists in Python 3, so call python3
# explicitly. The quoted heredoc delimiter keeps the shell from expanding
# anything inside the Python source.
python3 - "${work_dir}/positives" "${work_dir}/${output_file}" <<'PY'
import json
import sys

src, dst = sys.argv[1:3]

with open(src, 'r', encoding='utf8') as clean:
    # Strip before de-duplicating so that entries differing only in
    # surrounding whitespace collapse into a single word.
    words = sorted({line.strip() for line in clean})

with open(dst, 'w', encoding='utf-8') as ded:
    json.dump(words, ded, ensure_ascii=False)
PY

# Replace the previous result only after the new list was written completely.
mv -- "${work_dir}/${output_file}" "${output_file}"