forked from zbw/pm20_bin
-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_mets.pl
308 lines (254 loc) · 8.25 KB
/
create_mets.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
#!/bin/perl
# nbt, 31.1.2018
# TODO extend download file name with collection
# traverses folder roots in order to create internal and external
# DFG-Viewer-suitable METS/MODS files per folder
# can be invoked either by
# - an extended folder id (e.g., pe/000012)
# - a collection id (e.g., pe)
# - 'ALL' (to (re-) create all collections)
use strict;
use warnings;
use utf8;
use lib './lib';
use Data::Dumper;
use Encode;
use HTML::Entities qw(encode_entities_numeric);
use HTML::Template;
use JSON;
use Path::Tiny;
use Readonly;
use ZBW::PM20x::Folder;
$Data::Dumper::Sortkeys = 1;
# use folder_root for persistent URI, and image_root for internal linking
# to image files (spares redirect for every file)
Readonly my $FOLDER_ROOT_URI => 'https://pm20.zbw.eu/folder/';
Readonly my $IMAGE_ROOT_URI => 'https://pm20.zbw.eu/folder/';
Readonly my $PDF_ROOT_URI => 'https://pm20.zbw.eu/pdf/';
Readonly my $METS_ROOT => path('../web/folder');
Readonly my $IMAGEDATA_ROOT => path('../data/imagedata');
# Since at least 2020, DFG viewer exclusively uses DEFAULT resolution!!
# see e.g. mail by Alexander Bigga to the mailing list, 29.9.2020
Readonly my %RES_EXT => (
DEFAULT => '_B.JPG',
MAX => '_A.JPG',
MIN => '_C.JPG',
);
Readonly my @LANGUAGES => qw/ en de /;
Readonly my @COLLECTIONS => qw/ co pe sh wa /;
my ( $docdata_file, $imagedata_file, $docdata_ref, $imagedata_ref, );
my $tmpl = HTML::Template->new( filename => 'html_tmpl/mets.tmpl' );
# check arguments
if ( scalar(@ARGV) == 1 ) {
if ( $ARGV[0] =~ m:^(co|pe|wa|sh)$: ) {
my $collection = $1;
mk_collection($collection);
} elsif ( $ARGV[0] =~ m:^(co|pe)/(\d{6}): ) {
my $collection = $1;
my $folder_nk = $2;
mk_folder( $collection, $folder_nk );
} elsif ( $ARGV[0] =~ m:^(sh|wa)/(\d{6},\d{6})$: ) {
my $collection = $1;
my $folder_nk = $2;
mk_folder( $collection, $folder_nk );
} elsif ( $ARGV[0] eq 'ALL' ) {
mk_all();
} else {
&usage;
}
} else {
&usage;
}
####################
sub mk_all {
foreach my $collection (@COLLECTIONS) {
mk_collection($collection);
}
}
sub mk_collection {
my $collection = shift or die "param missing";
# load input files
load_files($collection);
foreach my $folder_nk ( sort keys %{$imagedata_ref} ) {
mk_folder( $collection, $folder_nk );
}
}
sub mk_folder {
my $collection = shift || die "param missing";
my $folder_nk = shift || die "param missing";
my $folder = ZBW::PM20x::Folder->new( $collection, $folder_nk );
# check if folder dir exists
my $rel_path = $folder->get_folder_hashed_path();
my $full_path = $ZBW::PM20x::Folder::FOLDER_ROOT->child($rel_path);
if ( not -d $full_path ) {
die "$full_path does not exist\n";
}
# open files if necessary
# (check with arbitrary entry)
if ( not defined $imagedata_ref ) {
load_files($collection);
}
# TODO clearly wrong - change to public/intern pdfs?
my $pdf_url = $PDF_ROOT_URI . "$rel_path/${folder_nk}.pdf";
foreach my $type ( 'public', 'intern' ) {
# get document list, skip if empty
my $doclist_ref = $folder->get_doclist($type);
next unless $doclist_ref and scalar( @{$doclist_ref} ) gt 0;
foreach my $lang (@LANGUAGES) {
my $label = encode_entities_numeric( $folder->get_folderlabel($lang) );
# feedback mailto
my $mailto =
"mailto:p%72essema%70pe20@%7Ab%77.eu"
. "?subject=Feedback%20zu%20PM20%20$label"
. "&body=%0D%0A%0D%0A%0D%0A---%0D%0A"
. "https://pm20.zbw.eu/dfgview/$collection/$folder_nk";
my %tmpl_var = (
pref_label => $label,
uri => "$FOLDER_ROOT_URI$collection/$folder_nk",
folder_nk => $folder_nk,
file_grp_loop => build_file_grp( $type, $folder ),
phys_loop => build_phys_struct( $type, $folder ),
log_loop => build_log_struct( $type, $lang, $folder ),
link_loop => build_link( $type, $folder ),
pdf_url => $pdf_url,
mailto => $mailto,
);
$tmpl->param( \%tmpl_var );
# write mets file for the folder
write_mets( $type, $lang, $folder, $tmpl );
}
}
}
sub load_files {
my $collection = shift || die "param missing";
$imagedata_file = $IMAGEDATA_ROOT->child("${collection}_image.json");
$imagedata_ref = decode_json( $imagedata_file->slurp );
}
sub build_file_grp {
my $type = shift || die "param missing";
my $folder = shift || die "param missing";
my @file_grp_loop;
foreach my $res ( sort keys %RES_EXT ) {
my %entry = (
use => $res,
file_loop => build_res_files( $type, $folder, $res ),
);
push( @file_grp_loop, \%entry );
}
return \@file_grp_loop;
}
sub build_res_files {
my $type = shift || die "param missing";
my $folder = shift || die "param missing";
my $res = shift || die "param missing";
my $collection = $folder->{collection};
my $folder_nk = $folder->{folder_nk};
my %imagedata = %{ $imagedata_ref->{$folder_nk} };
# create a flat list of files
my @file_loop;
foreach my $doc_id ( @{ $folder->get_doclist($type) } ) {
my $page_no = 1;
foreach my $page ( @{ $imagedata{docs}{$doc_id}{pg} } ) {
# create url according to dir structure
my $img_url;
$img_url =
$IMAGE_ROOT_URI
. $folder->get_document_hashed_path($doc_id)
. "/PIC/$page$RES_EXT{$res}";
my %entry = (
img_id => get_img_id( $folder_nk, $doc_id, $page_no, $res ),
img_url => $img_url,
);
push( @file_loop, \%entry );
$page_no++;
}
}
return \@file_loop;
}
sub get_img_id {
my $folder_nk = shift || die "param missing";
my $doc_id = shift || die "param missing";
my $page_no = shift || die "param missing";
my $res = shift || die "param missing";
return "img_${folder_nk}_${doc_id}_${page_no}_" . lc($res);
}
sub build_phys_struct {
my $type = shift || die "param missing";
my $folder = shift || die "param missing";
my $folder_nk = $folder->{folder_nk};
my %imagedata = %{ $imagedata_ref->{$folder_nk} };
my @phys_loop;
my $i = 1;
foreach my $doc_id ( @{ $folder->get_doclist($type) } ) {
my $page_no = 1;
foreach my $page ( @{ $imagedata{docs}{$doc_id}{pg} } ) {
my @size_loop;
foreach my $res ( sort keys %RES_EXT ) {
push( @size_loop,
{ img_id => get_img_id( $folder_nk, $doc_id, $page_no, $res ) } );
}
my %entry = (
i => $i,
phys_id => "phys_$i",
page_uri => "https://pm20.zbw.eu/error/folder_page_not_addressable",
# TODO size_loop -> res_loop
size_loop => \@size_loop,
);
push( @phys_loop, \%entry );
$page_no++;
$i++;
}
}
return \@phys_loop;
}
sub build_log_struct {
my $type = shift || die "param missing";
my $lang = shift || die "param missing";
my $folder = shift || die "param missing";
my @log_loop;
foreach my $doc_id ( @{ $folder->get_doclist($type) } ) {
my $label = $folder->get_doclabel( $lang, $doc_id );
my %entry = (
document_id => "doc$doc_id",
label => $label,
type => 'Document',
);
push( @log_loop, \%entry );
}
return \@log_loop;
}
sub build_link {
my $type = shift || die "param missing";
my $folder = shift || die "param missing";
my $folder_nk = $folder->{folder_nk};
my %imagedata = %{ $imagedata_ref->{$folder_nk} };
# duplicates logic from build_phys_struct()!
my @link_loop;
my $i = 1;
foreach my $doc_id ( @{ $folder->get_doclist($type) } ) {
foreach my $page ( @{ $imagedata{docs}{$doc_id}{pg} } ) {
my %entry = (
document_id => "doc$doc_id",
phys_id => "phys_$i",
);
push( @link_loop, \%entry );
$i++;
}
}
return \@link_loop;
}
sub write_mets {
my $type = shift || die "param missing";
my $lang = shift || die "param missing";
my $folder = shift || die "param missing";
my $tmpl = shift || die "param missing";
my $hashed_path = $folder->get_folder_hashed_path();
my $mets_dir = $METS_ROOT->child($hashed_path);
$mets_dir->mkpath;
my $mets_file = $mets_dir->child("$type.mets.$lang.xml");
$mets_file->spew_utf8( $tmpl->output() );
}
sub usage {
print "Usage: $0 {folder-id}|{collection}|ALL\n";
exit 1;
}