-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpub_spork.py
executable file
·223 lines (206 loc) · 9.02 KB
/
pub_spork.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
#!/usr/local/bin/python3
"""PubSpork is a utility that helps you manage and track publications.
It was created by @tnabtaf to help track papers that reference
@galaxyproject. However, it should be useful to manage any pubs that
reference anything.
See README.md for more.
"""
import argparse
import alert_sources
import lib_types
import report_formats
import generate_lib_report
import match_pubs
def get_args() -> argparse.Namespace:
    """Parse and return the command line arguments.

    Builds the PubSpork argument parser with three argument groups
    (common, match and report) and parses the process command line.

    Returns:
        The parsed argparse namespace.  When --match is given,
        ``args.sources`` is replaced by the list obtained from splitting
        the comma-separated --sources value; otherwise it is left as
        parsed.

    Exits:
        Via ``arg_parser.error`` (usage message, exit status 2) when
        --match is given without --sources, or when argparse itself
        rejects the command line.
    """
    arg_parser = argparse.ArgumentParser(
        description=(
            "PubSpork helps manage and track publications. "
            + "It contains two main functions: "
            + "1) Supporting curation of newly reported publications. "
            + "2) Library reporting. "
            + "**Supporting Curation**: "
            + "The --match function is used to combine: "
            + " a) a DB of publications we have already looked at. "
            + " b) a library (currently in Zotero or CiteULike) of pubs "
            + " that have already been identified as relevant. "
            + " c) A set of publication alerts. "
            + "into an HTML page containing all newly report publications "
            + "and links to those publications to help curate them. "
            + "**Library Reporting**: "
            + "The --report function generated the selected library report."))

    # Top-level mode switches: which of the two functions to run.
    arg_parser.add_argument(
        "--match", required=False, action="store_true",
        help=(
            "Match newly reported pubs with each other and with optional "
            + "libraries of already curated pubs. Generates an HTML page "
            + "that to use to curate the new pubs."))
    arg_parser.add_argument(
        "--report", required=False, action="store_true",
        help=(
            "Generate a library report."))

    # Arguments needed by both --match and --report runs.
    common_args = arg_parser.add_argument_group(
        title="Common arguments", description=None)
    common_args.add_argument(
        "--libtype", required=True,
        help=(
            "What type of of 'already accepted pubs' library are we reading "
            + "in and updating? Options are "
            + lib_types.get_lib_types_as_text_list() + "."))
    common_args.add_argument(
        "--inputlibpath", required=True,
        help=(
            "Path to the library of already accepted pubs. This is typically "
            + "exported from the library service."))
    common_args.add_argument(
        "--onlineliburl", required=True,
        help=(
            "Base URL of the online version of the library of already "
            + "accepted pubs. Used to generate links."))

    # Arguments that only apply to --match runs.
    match_args = arg_parser.add_argument_group(
        title="Match arguments", description=None)
    match_args.add_argument(
        "--email", required=False,
        help=(
            "Email account to pull new pub alerts from."))
    match_args.add_argument(
        "--mailbox", required=False,
        help=(
            "Optional mailbox within email account to limit notifications "
            + "from."))
    match_args.add_argument(
        "--imaphost", required=False,
        help=(
            "Address of --email's IMAP server. For GMail this is "
            + "imap.gmail.com."))
    match_args.add_argument(
        "--since", required=False,
        help=(
            "Only look at alerts from after this date. "
            + "Format: DD-Mmm-YYYY. Example: 01-Dec-2014."))
    match_args.add_argument(
        "--before", required=False,
        help=(
            "Optional. Only look at alerts before this date. "
            + "Format: DD-Mmm-YYYY. Example: 01-Jan-2015."))
    match_args.add_argument(
        "--sources", required=False,
        help=(
            "Which alert sources to process. Is either 'all' or a "
            + "comma-separated list (no spaces) from these sources: "
            + alert_sources.get_alert_sources_as_text_list()))
    match_args.add_argument(
        "--proxy", required=False,
        help=(
            "String to insert in URLs to access pubs through your paywall. "
            + "For Johns Hopkins, for example, this is: "
            + "'.proxy1.library.jhu.edu'"))
    match_args.add_argument(
        "--proxyseparator", required=False,
        help=(
            "Some proxies replace dots in the original pub URL with dashes. "
            + "Default is dots."),
        choices=['dot', 'dash'], default="dot")
    match_args.add_argument(
        "--customsearchurl", required=False,
        help=(
            "URL to use for custom searches at your institution. The title "
            + "of the publication will be added to the end of this URL."))
    match_args.add_argument(
        "--knownpubsin", required=False,
        help=(
            "Path to existing known pubs DB. This is the list of publications "
            + "you have already looked at. Typically generated from the "
            + "previous PubSpork run. In TSV format."))
    match_args.add_argument(
        "--knownpubsout", required=False,
        help="Where to put the new known pubs DB (in TSV format).")
    match_args.add_argument(
        "--okduplicatetitles", required=False,
        help=(
            "Text file containing duplicate titles that have been reviewed "
            + "and are in fact not duplicate titles. These will not get "
            + "reported as duplicates."))
    match_args.add_argument(
        "--excludesearches", required=False,
        help=(
            "Exclude searches look for matches that we want to exclude "
            + "from our results. These are useful because it is sometimes "
            + "easier to list each exclude search, each in a separate search "
            + "then to include all the excludes in each search (and "
            + "sometimes we can't make the search that long)."))
    match_args.add_argument(
        "--curationpage", required=False,
        help=(
            "Where to put the HTML page listing all the pubs. Required for "
            + " match runs."))

    # Arguments that only apply to --report runs.
    report_args = arg_parser.add_argument_group(
        title="Report arguments", description=None)
    report_args.add_argument(
        "--reportformat", required=False,
        help=(
            "What format to generate the report in. Options are "
            + report_formats.get_formats_as_text_list()
            + "."))
    # --journal and --year are report selectors, so they are registered in
    # the report group and show up under "Report arguments" in --help.
    report_args.add_argument(
        "--journal", required=False, action="store_true",
        help="Produce table showing number of papers in different journals.")
    report_args.add_argument(
        "--year", required=False, action="store_true",
        help="Produce table showing number of papers published each year.")
    report_args.add_argument(
        "--tagyear", required=False, action="store_true",
        help=(
            "Produce table showing number of papers with each tag, "
            + "each year."))
    report_args.add_argument(
        "--yeartag", required=False, action="store_true",
        help=(
            "Produce table showing number of papers with each year, "
            + "each tag."))
    report_args.add_argument(
        "--tagcountdaterange", required=False, action="store_true",
        help=(
            "Produce table showing number of papers that were tagged with "
            + "each tag during a given time period. --entrystartdate and "
            + "--entryenddate parameters are required if --tagcountdaterange "
            + "is specified."))
    report_args.add_argument(
        "--pubsdaterange", required=False, action="store_true",
        help=(
            "Produce list of publications in the given date range. What is "
            + "included depends on the --reportformat. --entrystartdate and "
            + "--entryenddate parameters are required if --pubsdaterange "
            + "is specified."))
    report_args.add_argument(
        "--entrystartdate", required=False,
        help=(
            "--tagcountdaterange will report on papers with entry dates "
            + "greater than or equal to this date. Example: 2016-12-29. "))
    report_args.add_argument(
        "--entryenddate", required=False,
        help=(
            "--tagcountdaterange will report on papers with entry dates "
            + "less than or equal to this date. Example: 2017-01-29. "))
    report_args.add_argument(
        "--onlythesetags", required=False,
        help=(
            "Can either generate a report about all tags in the library, "
            + "or, only about a subset of tags. If this parameter is given "
            + "then only the tags listed in this file will be reported on. "
            + "List one tag per line."))
    report_args.add_argument(
        "--numtagcolumngroups", required=False, type=int, default=4,
        help=(
            "Specifies how many tags (and their counts) should be listed "
            + "in each row of a tag report. Default is 4."))

    args = arg_parser.parse_args()

    if args.match:
        # --sources is optional to argparse, so it is None when omitted.
        # Report that as a usage error instead of failing on None.split().
        if args.sources is None:
            arg_parser.error(
                "--sources is required when --match is specified.")
        # split comma separated list of sources
        args.sources = args.sources.split(",")

    return args
# Script body: parse the command line once, then hand the parsed arguments
# to whichever function was requested.  --match takes precedence over
# --report; if neither flag was given, nothing is run.
args = get_args()
if not args.match:
    if args.report:
        generate_lib_report.generate_lib_report(args)
else:
    match_pubs.match_pubs(args)