student_thesis_download_count.py
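# Fetch download counts for student theses from a Pure server's REST API and
# write them to an Excel sheet: one row per thesis with Pure ID, title and
# download count, plus award year and managing organisational unit when
# get_details is enabled below.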
import xlwt
import requests
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import Element, SubElement
def construct_xml(input_size, input_offset, before_date, after_date):
    # Build the XML body for the Pure downloads query: student theses only,
    # restricted to the given access-time window, paged via size/offset.
    downloadsQuery = Element('downloadsQuery')
    family = SubElement(downloadsQuery, 'family')
    family.text = 'StudentThesis'
    accessTimeBeforeDate = SubElement(downloadsQuery, 'accessTimeBeforeDate')
    accessTimeBeforeDate.text = before_date
    accessTimeAfterDate = SubElement(downloadsQuery, 'accessTimeAfterDate')
    accessTimeAfterDate.text = after_date
    size = SubElement(downloadsQuery, 'size')
    size.text = str(input_size)
    offset = SubElement(downloadsQuery, 'offset')
    offset.text = str(input_offset)
    navigationLink = SubElement(downloadsQuery, 'navigationLink')
    navigationLink.text = 'true'
    return ET.tostring(downloadsQuery, method='xml')
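# A sketch of the request body construct_xml() returns, here with size=100 and
# offset=0 (shown pretty-printed for readability; the actual output is a single
# bytes object with no whitespace):
#
#   <downloadsQuery>
#     <family>StudentThesis</family>
#     <accessTimeBeforeDate>2020-02-05T00:00:00.001Z</accessTimeBeforeDate>
#     <accessTimeAfterDate>2015-11-01T00:00:00.001Z</accessTimeAfterDate>
#     <size>100</size>
#     <offset>0</offset>
#     <navigationLink>true</navigationLink>
#   </downloadsQuery>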
api_key = 'Your api key here'
base_api_url = 'https://pureserver/ws/api/516'
download_url = base_api_url + '/downloads'
detail_url = base_api_url + '/student-theses'

get_details = True  # True: fetch more details from the thesis record, but slow. False: id, title and count only. Significantly faster.
before_date = '2020-02-05T00:00:00.001Z'
after_date = '2015-11-01T00:00:00.001Z'

headers = {'Content-Type': 'application/xml', 'api-key': api_key}
namespaces = {'xsi': 'http://www.w3.org/2001/XMLSchema-instance'}
wb = xlwt.Workbook()
ws = wb.add_sheet('Student thesis download count')

row = 0
number_found = 1  # Non-zero so the first iteration of the paging loop runs
size = 100        # Page size per downloads query
step = 0
total_download_count = 0
while number_found:
    # Page through the downloads endpoint, size records at a time
    step += 1
    count = step * size
    offset = count - size
    xml = construct_xml(size, offset, before_date, after_date)
    response = requests.post(download_url, data=xml, headers=headers)
    tree = ET.fromstring(response.content)
    total_count = tree.findall('.//count', namespaces)[0].text
    downloads = tree.findall('.//items/download', namespaces)
    for download in downloads:
        pureid = download.findall('.//pureId', namespaces)[0].text
        title = download.findall('.//name/text', namespaces)[0].text
        download_count = download.findall('.//downloadCount', namespaces)[0].text
        uuid = download.findall('.//contentRef', namespaces)[0].attrib['uuid']
        ws.write(row, 0, pureid)
        ws.write(row, 1, title)
        ws.write(row, 2, download_count)
        if get_details:
            # Look up the thesis record itself for award year and managing faculty
            detail_response = requests.get(detail_url + "/" + uuid, headers=headers)
            student_thesis = ET.fromstring(detail_response.content)
            faculty = student_thesis.findall('.//managingOrganisationalUnit', namespaces)[0].attrib['externalId']
            award_date_year = student_thesis.findall('.//awardDate/year', namespaces)[0].text
            ws.write(row, 3, award_date_year)
            ws.write(row, 4, faculty)
        row += 1
        if row % 1000 == 0:
            ws.flush_row_data()
    number_found = len(downloads)  # An empty page ends the loop
    total_download_count += number_found
    print("fetched " + str(total_download_count) + "/" + total_count)

wb.save('student-thesis-download-count.xls')
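# Usage sketch, assuming Python 3 with the requests and xlwt packages
# installed: set api_key to a valid Pure API key and base_api_url to your own
# Pure server, then run:
#
#   pip install requests xlwt
#   python student_thesis_download_count.py
#
# The results are written to student-thesis-download-count.xls in the working
# directory.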