forked from zbw/pm20_bin
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse_doc_attrib.pl
executable file
·146 lines (118 loc) · 3.7 KB
/
parse_doc_attrib.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#!/bin/env perl
# nbt, 2020-06-12
# Parse DocAttribute files from pm-opac sources, and save as a json
# datatructure
use strict;
use warnings;
use utf8;
use Data::Dumper;
use JSON;
use Path::Tiny;
use Readonly;
use YAML::Tiny;
binmode( STDIN, ":encoding(iso-8859-1)" );
binmode( STDOUT, ":utf8" );
$Data::Dumper::Sortkeys = 1;
Readonly my $DOCATTRIB_ROOT => path('/pm20/data/DocAttribute/');
Readonly my $DOCDATA_ROOT => path('/pm20/data/docdata/');
my %input = (
pe => [qw/ DocAttribute_P.txt /],
co => [qw/ DocAttribute_FI.txt DocAttribute_A.txt /],
sh => [qw/ DocAttribute_S.txt /],
wa => [qw/ DocAttribute_W.txt /],
);
my %doc_stat;
foreach my $collection ( keys %input ) {
my %data;
foreach my $fn ( @{ $input{$collection} } ) {
print "Read $fn\n";
my @lines = split( /\r\n/, $DOCATTRIB_ROOT->child($fn)->slurp );
foreach my $orig_line (@lines) {
next if $orig_line eq '';
my %entry;
# cleanup messy lines
my $line = fix_line($orig_line);
my @parts = split( / *\[\] */, $line );
warn "Wrong format1: $orig_line\n" if scalar(@parts) eq 0;
# check first part of the line
my ( $folder_nk, $doc_id, $date ) = split( / +/, $parts[0] );
if ( not $doc_id or $doc_id eq '' ) {
warn "Wrong format2: $orig_line\n";
next;
}
## remove leading hash character
$doc_id =~ s/^#(.+)/$1/;
if ($date) {
$date =~ s/d=(.*)/$1/;
}
if ( $date and $date ne '' ) {
$entry{d} = $date;
} else {
next if not( $parts[1] );
}
# check second (optional) part of the line
if ( $parts[1] ) {
my @fields = split( /\|/, $parts[1] );
warn "Wrong format3: $orig_line\n" if scalar(@fields) eq 0;
foreach my $field (@fields) {
my ( $code, $content );
if ( $field =~ m/([a-z])=(.*)/ ) {
$code = $1;
$content = $2;
if ( $content ne '' ) {
$entry{$code} = $content;
}
} else {
warn "Wrong format4: '$orig_line'\n";
}
}
}
# add entry for the document
$data{$folder_nk}{$doc_id} = \%entry;
$doc_stat{$collection}++;
}
}
#print Dumper \%data;
my $out = $DOCDATA_ROOT->child( $collection . "_docattr.json" );
$out->spew( encode_json( \%data ) );
my %code_stat;
foreach my $fid ( keys %data ) {
foreach my $did ( keys %{ $data{$fid} } ) {
foreach my $code ( keys %{ $data{$fid}{$did} } ) {
$code_stat{$code}++;
}
}
}
print Dumper \%code_stat;
}
print Dumper \%doc_stat;
##################
sub fix_line {
my $line = shift or die "param missing";
# missing first field code - assume t=
$line =~ s/(.+) \[\] (.[^=].+)/$1 \[\] t=$2/;
# missing last field code with "Tabelle"
$line =~ s/(.+ \[\] .+;) (Tabelle)$/$1 i=$2/;
$line =~ s/(.+ \[\] .+;) i(Tabelle)$/$1 i=$2/;
# replace field delimiter '; ' with '|', because semicolon occurs in texts
# (repeat for multiple occurances)
$line =~ s/(.+ \[\] .+?); ([a-z]=.+)/$1|$2/;
$line =~ s/(.+ \[\] .+?); ([a-z]=.+)/$1|$2/;
$line =~ s/(.+ \[\] .+?); ([a-z]=.+)/$1|$2/;
$line =~ s/(.+ \[\] .+?); ([a-z]=.+)/$1|$2/;
# replace missing field delimiter with '|'
# (repeat for multiple occurances)
$line =~ s/(.+ \[\] .+?) +([a-z]=.+)/$1|$2/;
$line =~ s/(.+ \[\] .+?) +([a-z]=.+)/$1|$2/;
$line =~ s/(.+ \[\] .+?) +([a-z]=.+)/$1|$2/;
$line =~ s/(.+ \[\] .+?) +([a-z]=.+)/$1|$2/;
# author fields
$line =~ s/(|x=)\((\d+)\)(.+)/$1$2$3/;
$line =~ s/(|v=)von\/by (.+)/$1$2/;
# individual errors
## uppercase code
$line =~ s/(.+ \[\] )T=(.+)/$1t=$2/;
## missing code
$line =~ s/(.+ \[\])=(.+)/$1 t=$2/;
return $line;
}