#!/bin/bash
# wiki thumbs download script
#
# Reads a bzip2-compressed wiki XML dump, finds every [[Файл:...]] link and
# prints two candidate thumbnail URLs per link on stdout (the "commons" URL
# first, then the "ru" URL). Nothing is downloaded by this script itself.
#
# Usage: <this script> [DUMP.xml.bz2]
#   DUMP defaults to ruwikinews-20160111-pages-articles.xml.bz2 in the
#   current directory.

readonly DEFAULT_DUMP='ruwikinews-20160111-pages-articles.xml.bz2'
readonly DEFAULT_WIDTH='250px'

#######################################
# Print the candidate thumbnail URLs for a single file link.
# Globals:   DEFAULT_WIDTH (read)
# Arguments: $1 - link text starting with "[[Файл:", e.g.
#                 "[[Файл:Some name.jpg|thumb|200px|caption"
#                 (a trailing "]]" is accepted but not required)
# Outputs:   two lines on stdout (commons URL, then ru URL);
#            nothing when the link carries no file name
# Returns:   0
#######################################
thumb_urls() {
  local link=$1
  local name width hash site
  # A width parameter is 1-4 digits + "px" standing alone between pipes (or
  # at the end of the link), so "px" inside caption text is not picked up.
  local -r width_re='\|[[:space:]]*([0-9]{1,4})px[[:space:]]*(\||\]|$)'

  # File name = text after the prefix, up to the first "|" or "]".
  name=${link#'[[Файл:'}
  name=${name%%[]|]*}
  # Trim surrounding whitespace, then use underscores like the upload paths do.
  name=${name#"${name%%[![:space:]]*}"}
  name=${name%"${name##*[![:space:]]}"}
  name=${name// /_}
  # NOTE(review): the name is used exactly as written in the link. If the
  # server normalises titles (e.g. upper-cases the first letter), links
  # written differently will hash to another directory - confirm.
  [[ -n "$name" ]] || return 0

  if [[ "$link" =~ $width_re ]]; then
    width="${BASH_REMATCH[1]}px"
  else
    width=$DEFAULT_WIDTH
  fi

  # The two directory levels are the first one and first two hex digits of
  # the MD5 of the underscore-form file name.
  hash=$(printf '%s' "$name" | md5sum)
  hash=${hash%% *}

  # To download immediately instead of listing, feed each URL to:
  #   wget -nv -nc -x --restrict-file-names=nocontrol "$url"
  for site in commons ru; do
    printf 'http://upload.wikimedia.org/wikipedia/%s/thumb/%s/%s/%s/%s-%s\n' \
      "$site" "${hash:0:1}" "${hash:0:2}" "$name" "$width" "$name"
  done
}

#######################################
# Stream the dump and emit thumbnail URLs for every file link in it.
# Globals:   DEFAULT_DUMP (read)
# Arguments: $1 - optional path to the .xml.bz2 dump
# Outputs:   URLs on stdout, one per line
#######################################
main() {
  local dump=${1:-$DEFAULT_DUMP}
  local link

  # "[^]]*" stops each match at the first "]", so several links on one line
  # are reported separately instead of being merged into one greedy match.
  # -a keeps grep from classifying the UTF-8 dump as binary.
  bzcat -- "$dump" \
    | grep -a -o -E '\[\[Файл:[^]]*' \
    | while IFS= read -r link; do
        thumb_urls "$link"
      done
}

main "$@"
# Pipe the output of this script through `sort | uniq > urls.lst`, then download with:
# wget -nv -nc -x --restrict-file-names=nocontrol -i urls.lst