#!/usr/bin/perl -w
# V1.1 6/2002 by Guido Ostkamp, <Guido.Ostkamp@t-online.de>
# * more dictd-like sorting with by_dict_sort()
#
# V1.0 4/2002 by Michael Bunk, <kleinerwurm@gmx.net>
# * does some sorting of a tei-file, but without parser (worked out on xml,
# but might be fine with sgml - i didn't try it)
# * problem: xml-parser would not tell position, would it?
# * so let's do without parser ;-)
# * using some undefined collation order
# * I wrote this to be able to merge duplicate entries
# * sort with an in-memory index of
# - keyword (first orth)
# - byte-start-offset of entry in tei file (end is found by
# outputting until </entry>)
# * we could try DB_File (see manpage), but let's try simple hash first
# key: <orth>-characters
# value: struct <entry>-offset
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, visit
# <http://www.gnu.org/licenses/gpl-2.0.txt>
use strict;
# by_dict_sort - comparison routine for sort(), giving a dictd-like order.
#
# Compares the package variables $a and $b (set by sort) after normalising
# a private copy of each one:
#   * every character that is neither a word character (\w, which already
#     includes the digits) nor whitespace is removed;
#   * the remainder is folded to lower case.
# $a and $b themselves are never modified.
#
# Returns -1, 0 or 1 like the cmp operator.  Two different keys may compare
# as equal (e.g. "Foo!" and "foo").
sub by_dict_sort {
    my ($x, $y) = map {
        (my $key = $_) =~ s/[^\w\s]//g;    # drop punctuation and symbols
        lc $key;                           # ignore case
    } $a, $b;

    return $x cmp $y;
}
# Command line handling: the only argument is the name of the TEI file.
my $file = shift;

unless (defined $file) {
    # No argument given: print the usage text and leave with a failure status.
    print STDERR "\nteisort - sort tei file without using any xml parser (not safe)\n";
    print STDERR "\n The inputfile is expected in XML or SGML TEI format, see http://www.tei-c.org/\n";
    print STDERR " Output is on stdout.\n\n";
    print STDERR " Usage: teisort <teifile>\n";
    print STDERR " <teifile> : name of tei inputfile\n\n";
    exit 1;
}

die "Can't find file \"$file\"" unless -r $file;

# Three-argument open: the file name is never interpreted as a mode or a
# pipe, whatever characters it contains.  The bareword handle is kept
# because the rest of the script reads from HANDLE.
open(HANDLE, '<', $file) or die "Can't open file \"$file\": $!";

print STDERR "Generating index in memory...\n";
# ----------------------------------------------------------------------
# Pass 1: scan the TEI file line by line and build the in-memory index.
#
# Variables filled in here:
#   $headend   - offset of the first "<entry"; everything before it is
#                treated as the header
#   %orths     - sort key (text of the first <orth> of an entry)
#                => offset of that entry's "<entry"
#   $footstart - offset of the "</body>" found after the last entry
#   $counter   - number of entries put into the index
#
# Offsets are tell() positions of the line start plus a position inside
# the line.  NOTE(review): this assumes HANDLE has no decoding layer, so
# that character positions inside a line equal byte positions - confirm.
#
# The scanner is a small state machine.  $todo holds the part of the
# current line that has not been looked at yet and $offsetOnLine says how
# far into the line $todo starts.  $searchmode is one of:
#   0 = inside the header, looking for the first "<entry"
#   1 = between entries, looking for the next "<entry" or for "</body>"
#   2 = inside an entry, looking for "</entry>"
#   3 = footer reached, nothing more to find
#
# Limitation: a tag that is split across two lines is not recognised.
# ----------------------------------------------------------------------
my ($headend, $footstart, $offset, $entry, %orths, $orth, $counter,
    $tell_now, $tell_lastline);

$counter  = 0;
$tell_now = 0;     # the first line starts at offset 0
$entry    = "";

my $todo         = "";
my $offsetOnLine = 0;
my $searchmode   = 0;
my $tailstart    = 0;   # where the not-yet-indexed rest of the file begins;
                        # used as footer start if "</body>" never shows up

readfile: while (defined(my $line = <HANDLE>)) {
    $tell_lastline = $tell_now;      # offset at which $line starts
    $tell_now      = tell HANDLE;    # offset at which the next line starts
    $todo          = $line;
    $offsetOnLine  = 0;

    while ($todo ne "") {

        # find end of header (/i because SGML tags may be in any case)
        if ($searchmode == 0 && $todo =~ /<entry/i) {
            my $eoffset = $-[0];     # position of the match inside $todo
            $headend = $tell_lastline + $offsetOnLine + $eoffset;
            print STDERR "headoffset: $headend\n";
            # keep the "<entry" in $todo so that mode 1 sees it as the
            # start of the first entry
            $offsetOnLine += $eoffset;
            $todo = substr $todo, $eoffset;
            $searchmode = 1;
            next;
        }

        # find beginning of entry
        if ($searchmode == 1 && $todo =~ /<entry/i) {
            my $eoffset = $-[0];
            $offsetOnLine += $eoffset;
            $offset    = $tell_lastline + $offsetOnLine;
            $tailstart = $offset;
            $entry     = "";
            # $todo now starts with "<entry"; mode 2 collects the text
            $todo = substr $todo, $eoffset;
            $searchmode = 2;
            next;
        }

        # find footer
        if ($searchmode == 1 && $todo =~ /<\/body>/i) {
            $footstart = $tell_lastline + $offsetOnLine + $-[0];
            print STDERR "footoffset: $footstart\n";
            $searchmode = 3;
            $todo = "";
            last readfile;
        }

        # find end of entry
        if ($searchmode == 2 && $todo =~ /<\/entry>/i) {
            my $consumed = $+[0];    # position just behind "</entry>"
            $entry .= substr $todo, 0, $consumed;
            $offsetOnLine += $consumed;
            $todo = substr $todo, $consumed;
            $tailstart = $tell_lastline + $offsetOnLine;
            $searchmode = 1;

            # The sort key is the content of the first <orth> element.
            # Non-greedy matching stops at the first "</orth>"; /s lets
            # "." match newlines because an entry may span several lines.
            if ($entry =~ /<orth[^>]*>(.*?)<\/orth>/s) {
                $orth = $1;
            }
            else {
                warn "no orth found in entry!!! there is something wrong! Entry is <$entry>";
                $orth = "";          # still index the entry so it is not lost
            }

            # Never overwrite a key that is already in the hash: pad
            # duplicates with blanks.  The padding is never seen in the
            # output because the entries are read from the file again.
            $orth .= " " while exists $orths{$orth};

            $orths{$orth} = $offset;
            $counter++;
            print STDERR " $counter entries\n" if $counter % 100 == 0;
            next;
        }

        # Nothing of interest in the rest of this line.  Inside an entry
        # the text is kept for the <orth> lookup, otherwise it is skipped.
        $entry .= $todo if $searchmode == 2;
        $todo = "";
    }
}

# Make sure both offsets are defined even for unusual input.
unless (defined $headend) {
    warn "no <entry> found in input\n";
    $headend = 0;
}
unless (defined $footstart) {
    warn "no </body> found after the last entry\n";
    $footstart = $tailstart;
}

print STDERR " $counter entries\n";
###############################################################
print STDERR "Outputting sorted entries...\n";

# Pass 2: copy the header, then every indexed entry in sorted order, then
# the footer, from the input file to stdout.  Only buffered I/O (seek,
# read, readline) is used on HANDLE.

die "teisort: header or footer position unknown, nothing written\n"
    unless defined $headend && defined $footstart;

# output header: the first $headend bytes of the file
seek(HANDLE, 0, 0) or die "Can't rewind input file: $!";
my $header = "";
if ($headend > 0) {
    my $got = read(HANDLE, $header, $headend);
    die "Can't read header: $!" unless defined $got;
    die "Can't read header: unexpected end of file\n" unless $got == $headend;
}
print $header;

$counter = 0;
{
    # With "</entry>" as input record separator one readline call returns
    # everything from the current position up to and including the end tag.
    my $stopword = "</entry>";
    local $/ = $stopword;

    # this one simple sort call does the keywork!
    foreach my $key (sort by_dict_sort keys %orths) {
        $counter++;
        print STDERR " $counter entries\n" if $counter % 100 == 0;

        my $start = $orths{$key};
        seek(HANDLE, $start, 0)
            or die "Can't seek to entry at offset $start: $!";

        my $text = <HANDLE>;
        $text = "" unless defined $text;

        # readline returns a short record when the file ends first
        warn "entry at offset $start is not terminated by $stopword\n"
            unless length($text) >= length($stopword)
                && substr($text, -length($stopword)) eq $stopword;

        print $text;
    }
}

# output footer: everything from $footstart to the end of the file
seek(HANDLE, $footstart, 0)
    or die "Can't seek to footer at offset $footstart: $!";
{
    local $/;                        # undef: read the rest in one go
    my $footer = <HANDLE>;
    print $footer if defined $footer;
}

print STDERR " $counter entries\n";
close HANDLE;
# end of teisort