#!/usr/bin/env python3
"""Computes useful performance data from the Synda database. The beginning and ending dates and
times should be provided in a modified ISO 8601 format without letter separators, e.g.
'2019-01-25 13:04'. The optional third argument is a partial url, normally used to specify the
protocol and data node, e.g. gsiftp://vesg.ipsl.upmc.fr (default '%'). The % wildcard is
permitted, and a longer url may be used to narrow the coverage further."""
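# Example invocation (the server url is illustrative; a T may replace the space in the times):
#   ./synda-perf.py '2019-01-25T13:04' '2019-01-25T14:04' 'gsiftp://vesg.ipsl.upmc.fr'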
import os
import sys
import sqlite3
import datetime
def setup():
"""Initializes the connection to the database, etc."""
global conn, curs
# normal:
conn = sqlite3.connect('/var/lib/synda/sdt/sdt.db')
    # test on a temporary copy of the database (sqlite3 does not expand '~' itself):
    #conn = sqlite3.connect(os.path.expanduser('~/db/sdt.db'))
curs = conn.cursor()
def finish():
"""Closes connections to databases, etc."""
global conn, curs
conn.commit()
conn.close()
def str2time( date ):
"""Given a date string such as '2019-01-25 13:04' or '2019-01-25 13:04:13.922788',
this function returns a datetime object representing the date."""
FMT_min = '%Y-%m-%d %H:%M'
FMT_sec = '%Y-%m-%d %H:%M:%S'
FMT_frac = '%Y-%m-%d %H:%M:%S.%f'
try:
return datetime.datetime.strptime( date, FMT_frac )
except ValueError:
try:
return datetime.datetime.strptime( date, FMT_sec )
except ValueError:
return datetime.datetime.strptime( date, FMT_min )
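# For example, str2time('2019-01-25 13:04') returns datetime.datetime(2019, 1, 25, 13, 4).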
def downloading_intervals( startin, stopin, file_intervals ):
"""Returns active_time: the amount of time, in seconds, within (start,stop) in which at least
one of the files described by 'file_intervals' was being downloaded.
Input parameters:
- start and stop define the overal time interval.
The times provided are strings suitable for str2time, e.g. "2019-01-25 13:04:00.123"
- The list 'file_intervals' is a list of tuples from the database, of the form
(start_date, end_date, <ignored>). Each tuple defines a time interval in which one file was
being downloaded.
"""
start = str2time( startin )
stop = str2time( stopin )
file_ints = [ ( str2time(file_int[0]), str2time(file_int[1]) ) for file_int in file_intervals ]
file_ints.sort( key=(lambda x: x[0]) ) # sort by each file's start_date.
# Because file_ints is sorted, the following computes intervals in a sorted order, sorted by
# the bottom (start) time. Each file_int either extends an interval at the top, or starts a
# new interval above the top of the previous interval. That is, the intervals are disjoint,
# and ordered by the top (stop) time as well as the bottom time.
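    # For example (times in minutes, purely illustrative): if the sorted file_ints span
    # (0,4), (2,6), and (8,9), the first two merge into one interval (0,6) and the third
    # starts a new one, so active_time covers 6+1 = 7 minutes rather than the 4+4+1
    # sequential total.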
# intervals = [] # not used, but may be useful for debugging
bot = file_ints[0][0]
top = file_ints[0][1]
active_time = 0
for file_int in file_ints:
if file_int[0]<=top: # extend present interval
top = max( top, file_int[1] )
else: # new interval, after all previous intervals and previous files' end_date.
active_time += (top-bot).total_seconds()
# intervals.append( (bot,top) ) # save the last interval (not used)
bot = file_int[0]
top = file_int[1]
    # Count the final pending interval; the SQL query in perf_data guarantees top<=stop,
    # and an interval ending exactly at stop still counts as download time.
    active_time += (top-bot).total_seconds()
    # intervals.append( (bot,top) ) # save the last interval (not used)
return active_time
def url_hdr( url ):
    """url header, i.e. the protocol and data node but no more of the url."""
    third_slash = url.find( '/', 2+url.find('//') )
    if third_slash < 0:  # no path after the data node; the url is already just the header
        return url
    return url[:third_slash]
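# For example, url_hdr('gsiftp://vesg.ipsl.upmc.fr/some/path/file.nc') returns
# 'gsiftp://vesg.ipsl.upmc.fr' (the path portion of the url is illustrative).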
def url_hdrs( start, stop, server, method='aggregate' ):
"""Returns url headers (with protocol and data node) for transfers with times between
'start' and 'stop', and a specified server. These are the same transfers as for the
corresponding call of perf_data()."""
# If the SQL command is changed in perf_data, then this should be changed to match:
cmd = ("SELECT url FROM file WHERE start_date>='{0}' AND " +\
"end_date<='{1}' AND url LIKE '{2}%' AND " +\
"status='done' AND size IS NOT NULL").format(start, stop, server)
curs.execute( cmd )
results = curs.fetchall()
return list(set( [ url_hdr(r[0]) for r in results] ))
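# For example (illustrative): with server='%', url_hdrs may return several headers, such as
# ['gsiftp://vesg.ipsl.upmc.fr', 'http://esgf1.dkrz.de'], one per distinct protocol+node.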
def perf_data( start, stop, server, method='aggregate' ):
"""Returns performance data for transfers with times between 'start' and 'stop', and a
specified server.
The times should be in a modified ISO 8601 format without letter separators, e.g.
'2019-01-25 13:04'. The server - both the data node and the protocol - is specified as the
first characters of the url, e.g. 'gsiftp://esgf1.dkrz.de' or 'http://esgf1.dkrz.de'.
Optionally you may provide a method argument to specify how the rate is to be computed."""
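    # The return value is a 5-tuple: (rate in MiB/s, seconds per file (0 unless
    # method='aggregate'), total size in GiB, average file size in MiB, number of files),
    # e.g. from perf_data('2019-01-25 13:04', '2019-01-25 14:04', 'gsiftp://esgf1.dkrz.de').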
cmd = ("SELECT start_date, end_date, size FROM file WHERE start_date>='{0}' AND " +\
"end_date<='{1}' AND url LIKE '{2}%' AND " +\
"(status='done' OR status='published') AND size IS NOT NULL").format(start, stop, server)
# ...For more accuracy, I could include files overlapping the (start,stop) boundary, i.e.
# end_date>{0} and start_date<{1}. Then I would have to reduce the file size in proportion
# to the amount of the file's download time which is within (start,stop).
curs.execute( cmd )
results = curs.fetchall()
sizes = [ size for (start_date,end_date,size) in results ]
Nfiles = len(sizes)
totsize = sum(sizes)
if totsize==0:
return None,None,None,None,None
avgsize = totsize/Nfiles/1024./1024
spf = 0 # don't want to compute it in non-default cases
    if method=='aggregate': # (bytes downloaded)/(downloading time). Takes parallelism
            # into account, and doesn't count inactive time in (start,stop).
active_time = downloading_intervals( start, stop, results )
if active_time>0:
retrate = totsize/active_time/1024/1024.
retsize = totsize/1024/1024/1024.
else:
retrate = 0
retsize = 0
spf = active_time/len(sizes)
elif method=='aggregate-crude': # simply (bytes downloaded)/(stop-start). Takes parallelism
# into account, but it's off, sometimes way off, if there are inactive periods.
delta = str2time(stop) - str2time(start)
retrate = totsize/delta.total_seconds()/1024/1024.
retsize = totsize/1024/1024/1024.
elif method=='seqsize': # size-weighted method, but based on separate rates for each file,
# thus like "synda metric" except that the average is weighted by file size.
# In other words, compute time as if everything were sequential.
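        # For example (illustrative): files of 100 MiB and 300 MiB taking 50 s and 100 s
        # yield (400 MiB)/(150 s), i.e. about 2.67 MiB/s.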
deltas = [ str2time(end_date)-str2time(start_date) for (start_date,end_date,size) in results ]
        delta = sum( deltas, datetime.timedelta(0) )  # sum() needs a timedelta start value
retrate = totsize/delta.total_seconds()/1024/1024.
retsize = totsize/1024/1024/1024.
elif method=='arith': # simple arithmetic average
rates = [ size/(str2time(end_date)-str2time(start_date)).total_seconds()
for (start_date,end_date,size) in results if size!=0 ]
retrate = sum(rates)/1024/1024./len(rates)
retsize = totsize/1024/1024/1024.
else: # the simple arithmetic average which Synda does, but still restricted to the
# protocol+server and the date range. This is a bit less precise than arith because
# the 'rate' column in the database has been rounded to an integer.
cmd = ("SELECT avg(rate) FROM file WHERE status='done' AND rate IS NOT NULL AND "+\
"start_date>='{0}' AND end_date<='{1}' AND size IS NOT NULL AND "+\
"url LIKE '{2}%'").format(start,stop,server)
curs.execute( cmd )
results = curs.fetchall()
retrate = results[0][0]/1024/1024.
retsize = totsize/1024/1024/1024.
return round(retrate,4), round(spf,4), round(retsize,4), round(avgsize,4), Nfiles
if __name__ == '__main__':
    setup()
    print( "args=", sys.argv )
    if len( sys.argv ) < 3:
        print( "provide start time, stop time, and server in the form of" )
        print( "  '2019-01-25 13:04' '2019-01-25 14:04' 'gsiftp://esgf1.umr-cnrm.fr'" )
        print( "  You can use a % wildcard character when specifying the server." )
        print( "  You can use a T instead of a space between the date and time." )
    else:
        if False: # for tests:
            for method in ['aggregate','aggregate-crude','seqsize','arith','synda']:
                rate,spf,size,avgsize,Nfiles = perf_data( sys.argv[1], sys.argv[2],
                                                          sys.argv[3], method )
                if rate is None:
                    print( "No data downloaded" )
                else:
                    print( method, ' ', rate, "MiB/s", size, "GiB" )
        else:
            # Times with a T work better in scripts, e.g. '2019-01-25T13:04'.
            # The Synda database uses a space between the date and time, e.g.
            # '2019-01-25 13:04'
            start = sys.argv[1].replace('T',' ')
            stop = sys.argv[2].replace('T',' ')
            if len(sys.argv)>=4:
                server = sys.argv[3]
            else:
                server = '%'
            rate,spf,size,avgsize,Nfiles = perf_data( start, stop, server )
            if rate is None:
                print( "No data downloaded" )
            else:
                uhs = url_hdrs( start, stop, server )
                uhs.sort()
                print( 'rate', rate, "MiB/s Nfiles", Nfiles, " size", size, "GiB",
                       "avg size", avgsize, "MiB", uhs )
                if len(uhs)>1:
                    for uh in uhs:
                        rate,spf,size,avgsize,Nfiles = perf_data( start, stop, uh )
                        print( "rate {:6.2f}".format(rate),
                               "MiB/s Nfiles {:5d}".format(Nfiles),
                               " size {:8.2f}".format(size),
                               "GiB", " avg size {:8.2f}".format(avgsize), "MiB", uh )
    finish()