"""
Spider to scrape JIRA for transitions in and out of states.
Usage: `scrapy runspider jiraspider.py -o states.json`
"""
from collections import namedtuple
import datetime
import json
import re
import dateutil.parser
from jira.client import JIRA
import scrapy
from scrapy.http import Request
from scrapy.selector import Selector
SERVER = 'https://openedx.atlassian.net'
# Regex to match the duration field ("14d 22h 5m", "2h 33m", or "1m 10s")
DURATION_REGEX = re.compile(r'((?P<days>\d+?)d)?((?P<hours>\d+?)h)?((?P<minutes>\d+?)m)?((?P<seconds>\d+?)s)?')
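# Illustrative only: once spaces are stripped, the named groups map straight
# onto datetime.timedelta keyword arguments, e.g.
#   DURATION_REGEX.match('14d22h5m').groupdict()
#   => {'days': '14', 'hours': '22', 'minutes': '5', 'seconds': None}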
# All states in the ospr jira ticket workflow
OSPR_STATES = [
    'Needs Triage',
    'Waiting on Author',
    'Blocked by Other Work',
    'Product Review',
    'Community Manager Review',
    'Open edX Community Review',
    'Awaiting Prioritization',
    'Engineering Review',
    'Merged',
    'Rejected',
]
# Sometimes, tickets have other states (perhaps they were moved from one project to the OSPR project).
# Map those states to equivalent OSPR workflow states.
STATE_MAP = {
    'Open': 'Awaiting Prioritization',
    'Ready for Grooming': 'Awaiting Prioritization',
    'Groomed': 'Awaiting Prioritization',
    'In Backlog': 'Awaiting Prioritization',
    'Engineering Code Review': 'Engineering Review',
    'Resolved': 'Merged',
    'Verified': 'Merged',
    'In Progress': 'Engineering Review',
    'Blocked': 'Waiting on Author',
    'Community Review': 'Open edX Community Review',
}
IssueFields = namedtuple('IssueFields', ['key', 'labels', 'issuetype'])


def extract_fields(issue):
    """
    Extracts JIRA issue fields and returns an IssueFields object.
    """
    key = issue.key
    labels = issue.fields.labels
    issuetype = issue.fields.issuetype.name
    return IssueFields(key, labels, issuetype)


def jira_issues(server_url, project_name):
    """
    Get JIRA issue keys for a project on a server.
    Returns a list of IssueFields such as:
    [IssueFields(key='OSPR-456', labels=['Dedx'], issuetype=u'Sub-task'),
     IssueFields(key='OSPR-458', labels=[], issuetype=u'Pull Request Review')]
    """
    jira = JIRA({'server': server_url})
    issues = jira.search_issues(f'project = {project_name}', maxResults=None)
    return [extract_fields(iss) for iss in issues]


class IssueStateDurations(scrapy.Item):
    """Scrapy item class. Defines fields that will be json-serialized."""
    # String: JIRA issue key, eg "OSPR-1"
    issue = scrapy.Field()
    # Dictionary: states mapped to the amount of time ([days, seconds] format)
    # spent in each state, eg
    # {"Waiting on Author": [0, 72180], "Needs Triage": [1, 38520]}
    states = scrapy.Field()
    # List: Labels present on the ticket, if any, eg ["TNL"]
    labels = scrapy.Field()
    # String: any debug information the parser outputs on this entry
    debug = scrapy.Field()
    # String: any error information the parser outputs on this entry
    error = scrapy.Field()
    # Resolution date - serialized datetime
    resolved = scrapy.Field()
    # State the ticket is currently in (if not resolved)
    current = scrapy.Field()
    # List [str, str]: Resolution transition of the issue, if resolved, eg
    # ["Waiting on Author", "Merged"] indicates that the issue was merged from
    # the "Waiting on Author" state.
    resolution = scrapy.Field()

    def __str__(self):
        # scrapy.Item fields are accessed dict-style, not as attributes.
        return f"Processed issue {self['issue']}"
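
# For orientation only: one serialized item in states.json might look roughly
# like this (the values are illustrative, not real data):
#   {"issue": "OSPR-1", "labels": ["TNL"],
#    "states": {"Needs Triage": [1, 38520], "Waiting on Author": [0, 72180]},
#    "resolution": ["Waiting on Author", "Merged"],
#    "resolved": "2015-04-02 10:11:00"}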


class JiraSpider(scrapy.Spider):
    """Scrapy spider for scraping JIRA"""
    name = 'jiraspider'
    # Log only at INFO level -- change to 'DEBUG' for extremely verbose output.
    # (custom_settings replaces the module-level scrapy.conf hook, which was
    # removed from Scrapy.)
    custom_settings = {'LOG_LEVEL': 'INFO'}
    default_time = datetime.timedelta(0)
    onemin = datetime.timedelta(minutes=1)
    # If you need to force authentication, you can define these here.
    # http_user = 'ned'
    # http_pass = 'not-my-real-password'

    def start_requests(self):
        """Scrapy method. Starts the spider."""
        issues = jira_issues(SERVER, 'OSPR')
        requests = []
        for issue in issues:
            # TODO (potentially) ignore subtasks, or do something with them? (issue.issuetype)
            request = Request(
                f"{SERVER}/browse/{issue.key}?page=com.googlecode.jira-suite-utilities:transitions-summary-tabpanel"
            )
            request.meta['issue_key'] = issue.key
            request.meta['labels'] = issue.labels
            requests.append(request)
        return requests

    def parse(self, response):
        """
        Parses a single response into an IssueStateDurations() object.
        Returns the object itself.
        (Scrapy requires this callback to be named `parse`.)
        """
        # Set up item to hold info from the response
        item = IssueStateDurations()
        item['issue'] = response.meta['issue_key']
        item['labels'] = response.meta['labels']
        item['debug'] = ''
        item['error'] = ''
        # if you need to figure out why this scraper is broken, dumping the
        # response bodies is really helpful...
        if 0:
            # response.body is bytes, so write in binary mode.
            with open("jira-{}.html".format(response.meta['issue_key']), "wb") as f:
                f.write(response.body)
        # The data we need to parse is stored in a JavaScript string. Find it
        # and parse it.
        embedded = self.parse_embedded_ajax_data(response, "activity-panel-pipe-id")
        states = {}
        # Find each <div class="issue-data-block">
        transitions = embedded.xpath('//div[@class="issue-data-block"]')
        # Parse each transition block, pulling out the source status & how much time was spent in that status
        for trans in self.clean_transitions(transitions, item):
            (source_status, dest_status, duration) = trans
            # States can be revisited along different transition arrows, so durations accumulate below.
            # Parse the time into a datetime.timedelta object
            duration_datetime = self.parse_duration(duration)
            # ignore states that we spent less than one minute in (often this is just because transitioning
            # through JIRA states is stupid, or botbro is stupid)
            if source_status != 'Needs Triage' and duration_datetime < self.onemin:
                item['debug'] += f"Ignoring state ({source_status}) of length {duration}. "
                continue
            # Add the amount of time spent in source status to any previous recorded time we've spent there
            states[source_status] = states.get(source_status, self.default_time) + duration_datetime
            self.validate_tdelta(
                states[source_status],
                item,
                f'adding src status time: {duration_datetime}'
            )
        # Need to consider current state ('dest_status'), as well.
        if not transitions:
            # If we didn't find any transitions, generally this means that the ticket is newly-opened. But log
            # so we can look at things anyway.
            item['debug'] += "DEBUG: Could not find any transitions for key {}. ".format(item['issue'])
            # Set current status by hand, since we know it.
            dest_status = "Needs Triage"
            # get "created" time (this is new, so doesn't have any transitions; it's still in needs triage)
            trans_date = response.xpath('.//span[@id="created-val"]/time[@class="livestamp"]/text()').extract()[0].strip()
        else:
            # get the time this transition was executed -- in a terribly shitty format.
            trans_date = transitions[-1].xpath('.//div[@class="action-details"]//time[@class="livestamp"]/text()').extract()[0].strip()
        try:
            last_execution_date = self.parse_last_execution_time(trans_date)
        except ValueError:
            # couldn't parse the last execution time for some reason. Log error and continue.
            item['error'] += 'ERROR: failed to parse last execution date from date "{date}"\n'.format(
                date=trans_date
            )
            last_execution_date = None
        if dest_status in ['Merged', 'Rejected']:
            # If the ticket's been resolved (merged or closed), store the resolution as a tuple of
            # (source, result) so we can get not just resolution but the previous state prior to resolution
            item['resolution'] = (source_status, dest_status)
            # Also store when it was resolved
            if last_execution_date:
                item['resolved'] = str(last_execution_date)
        else:
            # This is the current ticket state
            item['current'] = dest_status
            if last_execution_date:
                current_duration = datetime.datetime.now() - last_execution_date
                self.validate_tdelta(
                    current_duration,
                    item,
                    f'cd: now() minus last time, {last_execution_date}'
                )
                states[dest_status] = states.get(dest_status, self.default_time) + current_duration
                self.validate_tdelta(
                    states[dest_status],
                    item,
                    f'getting dest status {dest_status}, duration {current_duration}'
                )
        # json-serialize each timedelta as a [days, seconds] pair (dropping microseconds)
        item['states'] = {}
        for (state, tdelta) in states.items():
            item['states'][state] = [tdelta.days, tdelta.seconds]
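        # Illustrative only: timedelta(days=1, seconds=38520, microseconds=250)
        # serializes as [1, 38520].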
        # If we didn't find any debugs or errors, remove before returning
        if item['debug'] == '':
            del item['debug']
        if item['error'] == '':
            del item['error']
        return item

    def parse_embedded_ajax_data(self, response, key):
        """
        Find a JavaScript string indicated by a particular key, and unpack it
        in gross ways.
        Returns:
            A Scrapy Selector for the data.
        """
        marker = f'WRM._unparsedData["{key}"]='
        # Use response.text rather than response.body so we compare str to str.
        for line in response.text.splitlines():
            if line.startswith(marker):
                break
        else:
            raise Exception(f"Couldn't find {key} in response")
        # Extract the data from the line.
        js_str = line.split('=', 1)[1].strip().rstrip(';')
        # A JavaScript string can have \' but JSON cannot.
        js_str = js_str.replace(r"\'", "'")
        # Now json.loads can handle the string literal, to get the JS data.
        js_data = json.loads(js_str)
        # The JS data is itself JSON, so load it again to get the real data.
        html_data = json.loads(js_data)
        # Parse it as HTML, and return the Selector.
        selector = Selector(text=html_data)
        return selector
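
    # To illustrate the double decoding above (the page markup may vary; this
    # is a sketch, not captured output): the matched line looks roughly like
    #   WRM._unparsedData["activity-panel-pipe-id"]="\"<div class=...>\"";
    # Splitting on '=' and stripping the ';' leaves a JS string literal; the
    # first json.loads yields a JSON-encoded string, and the second yields the
    # HTML fragment that Selector then parses.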

    def clean_transitions(self, transitions, item):
        """
        Parse each transition, discarding any transitions that go in/out of the "Verified" state
        and cleaning up other random states that may have been introduced.
        See https://openedx.atlassian.net/browse/OSPR-369
        """
        cleaned = []
        for block in transitions:
            try:
                # Find the _relative_ (.//) changehistory div
                trans = block.xpath('.//div[@class="changehistory action-body"]')
                # Within each <div class="changehistory action-body"> find the table, with three entries (<td> elts).
                # td1 has a nested table; this table's td1 is the source; td3 is the dest.
                # td2 (of the outer table) is the duration.
                source_status = trans.xpath('table/tr/td[1]/table/tr/td[1]/span/text()').extract()[0].strip()
                dest_status = trans.xpath('table/tr/td[1]/table/tr/td[3]/span/text()').extract()[0].strip()
                # Discard transitions that go in/out of the "Verified" state
                if source_status == "Verified" or dest_status == "Verified":
                    continue
                source_status = self.remap_states(source_status, item)
                dest_status = self.remap_states(dest_status, item)
                duration = trans.xpath('table/tr/td[2]/text()').extract()[0].strip()
                # print('*'*10 + source_status + '->' + dest_status + '; ' + duration)
                cleaned.append((source_status, dest_status, duration))
            except Exception as err:
                item['error'] += f"ERROR in clean_transitions: {err}"
                continue
        return cleaned

    def remap_states(self, status, item):
        """
        Cleans up messy data by remapping other boards' states into OSPR states.
        Logs an error message for any states we don't know about.
        """
        if status in STATE_MAP:
            return STATE_MAP[status]
        elif status not in OSPR_STATES:
            # emit error message if we find an unexpected state
            item['error'] += f"ERROR: Found unexpected state '{status}'!"
        return status

    def parse_duration(self, duration):
        """
        Parses the duration time that we scrape from the "Time in Source Status" field.
        Will be in one of 3 forms: "14d 22h 5m", "2h 33m", or "1m 10s".
        Turns the d/h/m/s components into a datetime.timedelta.
        """
        duration = duration.replace(' ', '')
        parts = DURATION_REGEX.match(duration)
        td_dict = {}
        for (name, value) in parts.groupdict().items():
            if value:
                td_dict[name] = int(value)
        return datetime.timedelta(**td_dict)
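
    # Illustrative only: parse_duration("2h 33m") should yield
    # datetime.timedelta(hours=2, minutes=33), and parse_duration("1m 10s")
    # datetime.timedelta(minutes=1, seconds=10); sub-minute states (outside
    # "Needs Triage") are discarded in parse() rather than here.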

    def parse_last_execution_time(self, etime):
        """
        Parses the "last execution time" field in JIRA, which is a terrible format, either:
        if over a week ago:
            - 02/Apr/15 10:11 AM
        if within the past week:
            - Saturday 9:23 AM
        if yesterday:
            - Yesterday 11:05 AM
        if today:
            - Today 9:09 AM
        Returns a datetime.datetime object representing when the last execution occurred.
        """
        # dateutil.parser.parse will do all the formats we need except "Yesterday"
        if 'Today' in etime:
            # For things that happened "Today", we can parse using the fuzzy flag.
            return dateutil.parser.parse(etime, fuzzy=True)
        if 'Yesterday' in etime:
            ayer = datetime.date.today() - datetime.timedelta(days=1)
            yesterday = '{0.month}/{0.day}/{0.year}'.format(ayer)
            etime = etime.replace('Yesterday', yesterday)
        # This is sometimes returning dates in the future when the day is
        # specified as an abstract day such as "Wednesday"
        parsed_dt = dateutil.parser.parse(etime)
        # if it's a date in the future, take it back 7 days
        if parsed_dt > datetime.datetime.now():
            parsed_dt = parsed_dt - datetime.timedelta(days=7)
        return parsed_dt
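
    # Illustrative examples (a sketch, not captured output):
    #   parse_last_execution_time('02/Apr/15 10:11 AM') -> datetime(2015, 4, 2, 10, 11)
    #   parse_last_execution_time('Yesterday 11:05 AM') -> yesterday at 11:05
    #   parse_last_execution_time('Saturday 9:23 AM')   -> the most recent Saturday,
    #       since any bare weekday that dateutil resolves to a future date gets
    #       pulled back seven days by the check above.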

    def validate_tdelta(self, tdelta, item, msg):
        """
        Validates that the given timedelta is a positive quantity.
        Logs a debug message if not.
        """
        if tdelta < datetime.timedelta(0):
            item['debug'] += f'DEBUG: negative time found. {msg}'