forked from zbw/pm20_bin
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate_sitemap.pl
173 lines (143 loc) · 3.83 KB
/
create_sitemap.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
#!/bin/env perl
# nbt, 2022-03-25
# get an indexed xml sitemap of all pm20 pages
use strict;
use warnings;
use utf8;
use lib './lib';
use Data::Dumper;
use List::MoreUtils qw/uniq/;
use Path::Tiny;
use Web::Sitemap;
use Web::Sitemap::Url_patched;
$Data::Dumper::Sortkeys = 1;
my $sm = Web::Sitemap->new(
output_dir => '/pm20/web',
### Options ###
temp_dir => '/tmp',
loc_prefix => 'https://pm20.zbw.eu',
index_name => 'sitemap',
file_prefix => 'sitemap.',
# mark for grouping urls
##default_tag => 'my_tag',
# add <mobile:mobile/> inside <url>, and appropriate namespace (Google
# standard)
##mobile => 1,
# add appropriate namespace (Google standard)
##images => 1,
# additional namespaces (scalar or array ref) for <urlset>
##namespace => 'xmlns:some_namespace_name="..."',
# location prefix for files-parts of the sitemap (default is loc_prefix value)
## file_loc_prefix => 'http://my_domain.com',
# specify data input charset
charset => 'utf8',
move_from_temp_action => sub {
my ( $temp_file_name, $public_file_name ) = @_;
File::Copy::move( $temp_file_name, $public_file_name );
chmod 0664, $public_file_name;
}
);
# not used - would only make sense with enhanced prio
my @main_url_list = (
qw {
/about.de.html
/about.en.html
}
);
##$sm->add( \@main_url_list, tag => 'main' );
# work through all sets used in make, get all HTML urls (from file system) and
# add them
foreach my $set (qw/ default category co pe sh wa pdf /) {
print "$set ...\n";
my $url_list_ref = get_urls($set);
$sm->add( $url_list_ref, tag => $set );
}
# After calling finish() method will create an index file, which will link to files with URL's
$sm->finish;
# rough overview
print Dumper $sm;
###################
sub get_urls {
my $set = shift or die "param missing";
my @temp;
if ( $set eq 'pdf' ) {
## get pdf from about-pm20 only (not doc)
@temp = `cd /pm20/web ; find ./about-pm20 -name "*.pdf"`;
} elsif ( grep ( /^$set$/, qw/ co pe sh wa / ) ) {
## use prepared list of folders with documents
@temp = split( /\n/,
path("/pm20/data/folderdata/${set}_for_sitemap.lst")->slurp );
} else {
## get a list of .md files as used in make
@temp = `/bin/sh /pm20/web/mk/find_md.sh $set`;
}
my $url_list_ref;
## for some strange reason, lines in ??_for_sitemap.lst are duplicate
foreach my $line ( uniq @temp ) {
chomp($line);
$line = substr( $line, 1, );
$line =~ s/(.+)?\.md$/$1\.html/;
next unless $line =~ m/\.(html|pdf)$/;
my $entry = {
loc => $line,
priority => get_priority($line),
};
push( @$url_list_ref, $entry );
}
return $url_list_ref;
}
sub get_priority {
my $url = shift or die "param missing";
my $priority = '0.2';
my @url_prios = (
{
pattern => qr{^/about\...\.html$},
priority => '1.0',
},
{
pattern => qr{^/about-pm20/legal},
priority => '0.1',
},
{
pattern => qr{^/about-pm20/(?:hwwa|fs|wia|publication/testimonial)},
priority => '0.9',
},
{
pattern => qr{^/(?:doc/holding|film/about)},
priority => '0.9',
},
{
pattern => qr{^/category/(?:geo|subject|ware)/about},
priority => '0.9',
},
{
pattern => qr{^/folder/(?:co|pe)/[0-9]},
priority => '0.8',
},
{
pattern => qr{^/about-pm20/(?:about|links|publication)},
priority => '0.7',
},
{
pattern => qr{^/category/},
priority => '0.6',
},
{
pattern => qr{^/report},
priority => '0.3',
},
{
pattern => qr{^/error},
priority => '0.0',
},
);
foreach my $url_prio (@url_prios) {
my $pattern = $url_prio->{pattern};
my $prio = $url_prio->{priority};
if ( $url =~ $pattern ) {
$priority = $prio;
last;
}
}
return $priority;
}