Wikipedia:Archiv/Alternative Benutzerstatistik/Programme/mkzsf
<syntaxhighlight lang="perl">
- !/usr/bin/perl
use Digest::MD5 qw(md5_base64);
$bs = "Linux"; # oder "Windows"
sub xmlunesc { my $text = shift; $text =~ s/</</sg; $text =~ s/>/>/sg; $text =~ s/'/'/sg; $text =~ s/"/"/sg; #" $text =~ s/&/&/sg; $text; }
sub xmlsiteinfo { while($xml =~ /<namespace key="(.*?)"(?: \/>|>(.*?)<\/namespace>)/sg) { $namespace{$2} = $1 if defined $2; } $xml = ""; }
sub xmlrevision { $xml =~ s/<revision>(.*?)<\/revision>//s; my $revcontent = $1; $rev = {}; xmlpage() unless defined $page; $xml = ""; while($revcontent =~ /<(id|timestamp|contributor|minor|comment|text)(?: xml:space="preserve"| type="(.*?)")*(?:\s*\/>|>(.*?)<\/\1>)/sg) { my ($tag, $type, $content) = ($1, $2, $3); $content = "" unless defined $content; if($tag =~ /^text/) { $rev->{"text"} = xmlunesc($content); } elsif($tag eq "contributor") { if($content =~ /<username>(.*?)<\/username>\s*<id>(.*?)<\/id>/) { $rev->{"user_text"} = xmlunesc($1); $rev->{"user"} = $2; } elsif($content =~ /<ip>(.*?)<\/ip>/) { $rev->{"user_text"} = xmlunesc($1); $rev->{"user"} = 0; } else { $rev->{"user_text"} = "_"; $rev->{"user"} = 0; } } elsif($tag eq "comment") { $rev->{"comment"} = xmlunesc($content); } elsif($tag eq "timestamp") { $content =~ /^(....)-(..)-(..)T(..):(..):(..)Z$/; $rev->{"timestamp"} = "$1$2$3$4$5$6"; } else { $rev->{$tag} = $content; } } revision(); }
sub xmlpage { while($xml =~ s/<(title|id|restrictions)(?:\s*\/>|>(.*?)<\/\1>)//s) { my ($tag, $content) = ($1, $2); if($tag eq "title") { $content = xmlunesc($content); if($content =~ /(.+?):(.+)/ && defined $namespace{$1}) { $page->{"namespace"} = $namespace{$1}; $content = $2; } else { $page->{"namespace"} = 0; } } $page->{$tag} = defined $content ? $content : ""; } }
sub revision { my $is_redirect = $rev->{"text"} =~ /^# ?redirect/i; my $len = length($rev->{"text"}); return unless defined $page->{"title"}; my $text_md5 = md5_base64($rev->{"text"}); my $loeschlink = $rev->{"text"} =~ /\[\[Wikipedia:(Löschkandidaten|Seiten, die gelöscht werden sollten)/s || $rev->{"text"} =~ /\{\{(msg:)?(vfd|Lösch|URV)/is; $page->{"title"} = "_" unless defined $page->{"title"}; $page->{"title"} =~ s/\s/_/sg; $rev->{"user_text"} =~ s/\s/_/sg; $rev->{"comment"} = "_" unless defined $rev->{"comment"}; $rev->{"comment"} =~ s/\s/_/sg; printf ZSF1 "%s %7i %i %i %s %20s %i %s\n", $rev->{"timestamp"}, $len, $is_redirect, $loeschlink, $text_md5, $rev->{"user_text"}, $page->{"namespace"}, $page->{"title"}; }
sub kategorien { my $links = $page->{"title"}; while($rev->{"text"} =~ /\[\[Kategorie:([^\|\]]*)/sg) { my $kat = $1; $kat =~ s/\s/_/sg; $links .= " " . $kat; } print KATLINKS "$links\n" if $page->{"namespace"}==0; print KATTREE "$links\n" if $page->{"namespace"}==14; }
sub sortiere { my $tmp = $ENV{"LC_ALL"}; $ENV{"LC_ALL"} = "C"; system "sort $_[0] /O $_[1]" if $bs eq "Windows"; system "sort -T ./sort-tmp -S 400M -o $_[1] $_[0]" if $bs eq "Linux"; $ENV{"LC_ALL"} = $tmp; unlink $_[0]; }
mkdir "sort-tmp" unless -d "sort-tmp"; open ZSF1, ">zsf1"; open KATLINKS, ">katlinks"; open KATTREE, ">kattree"; while(<>) { $xml .= $_; if(/^\s*<\/siteinfo>/) { xmlsiteinfo(); } next unless defined %namespace; if(/^\s*<\/revision>/) { xmlrevision(); } elsif(/^\s*<\/page>/) { kategorien(); $xml = ""; $page = undef; } } close ZSF1; close KATLINKS; close KATTREE; sortiere("zsf1", "zsf"); rmdir "sort-tmp"; </syntaxhighlight>