#!/usr/bin/env python3 -B
# <!-- vim: set ts=2 sw=2 sts=2 et: -->
"""
## Ezr.py
© 2024 Tim Menzies ([email protected]). BSD-2 license
### USAGE:
python3 ezr.py [OPTIONS]
This code explores multi-objective optimization; i.e. what
predicts for the better goal values? This code also explores
active learning; i.e. how to make predictions after looking at
the fewest number of goal values?
### OPTIONS:
-b --buffer int chunk size, when streaming = 100
-L --Last int max number of labels = 30
-c --cut float borderline best:rest = 0.5
-C --Cohen float pragmatically small = 0.35
-e --eg str start up action = mqs
-f --fars int number of times to look far = 20
-h --help show help = False
-k --k int low frequency Bayes hack = 1
-l --label int initial number of labels = 4
-m --m int low frequency Bayes hack = 2
-p --p int distance formula exponent = 2
-s --seed int random number seed = 1234567891
-S --Stop int min size of tree leaves = 30
-t --train str training csv file. row1 has names = data/misc/auto93.csv
### Data File Format
Training data consists of csv files where "?" denotes missing values.
Row one lists the column names, defining the roles of the columns:
- NUMeric column names start with an upper case letter.
- All other columns are SYMbolic.
- Names ending with "+" or "-" are goals to maximize/minimize
- Anything ending in "X" is a column we should ignore.
For example, here is data where the goals are `Lbs-,Acc+,Mpg+`;
i.e. we want to minimize car weight while maximizing acceleration
and fuel economy (miles per gallon).
Clndrs Volume HpX Model origin Lbs- Acc+ Mpg+
------- ------ --- ----- ------ ---- ---- ----
4 90 48 78 2 1985 21.5 40
4 98 79 76 1 2255 17.7 30
4 98 68 77 3 2045 18.5 30
4 79 67 74 2 2000 16 30
...
4 151 85 78 1 2855 17.6 20
6 168 132 80 3 2910 11.4 30
8 350 165 72 1 4274 12 10
8 304 150 73 1 3672 11.5 10
Note that the top rows are
better than the bottom ones (lighter, faster cars that are
more economical).
"""
# todo: labelling via clustering.
# ## Setting-up
# ### Imports
from __future__ import annotations
from typing import Any as any
from typing import List, Dict, Type, Callable, Generator
from fileinput import FileInput as file_or_stdin
from dataclasses import dataclass, field, fields
import datetime
from math import exp,log,cos,sqrt,pi
import re,sys,ast,math,random,inspect
from time import time
import stats
R = random.random
one = random.choice
#
# ### Types and Classes
#
# Some misc types:
number = float | int #
atom = number | bool | str # and sometimes "?"
row = list[atom]
rows = list[row]
classes = dict[str,rows] # `str` is the class name
def LIST(): return field(default_factory=list)
def DICT(): return field(default_factory=dict)
#
# NUMs and SYMs are both COLumns. All COLumns track `n` (items seen),
# `at` (their column number) and `txt` (column name).
@dataclass
class COL:
n : int = 0
at : int = 0
txt : str = ""
#
# SYMs track symbol counts and the `mode` (the most frequently seen symbol).
@dataclass
class SYM(COL):
has : dict = DICT()
mode : atom=None
most : int=0
#
# NUMs track the `lo,hi` seen so far, as well as the `mu` (mean) and `sd` (standard
# deviation), updated incrementally using Welford's algorithm.
@dataclass
class NUM(COL):
mu : number = 0
m2 : number = 0
sd : number = 0
lo : number = 1E32
hi : number = -1E32
goal : number = 1
# A minus sign at the end of a NUM's name says "this is a column to minimize"
# (all other goals are to be maximized).
def __post_init__(self:NUM) -> None:
if self.txt and self.txt[-1] == "-": self.goal=0
#
# COLS is a factory that reads the column `names` from the first
# row, then creates the appropriate columns.
@dataclass
class COLS:
names: list[str] # column names
all : list[COL] = LIST() # all NUMS and SYMS
x : list[COL] = LIST() # independent COLums
y : list[COL] = LIST() # dependent COLumns
klass: COL = None
# Collect `all` the COLs as well as the independent `x` and dependent `y` lists.
# Upper case names are NUMerics. Anything ending in `+` or `-` is a goal to
# be maximized or minimized. Anything ending in `X` is ignored.
def __post_init__(self:COLS) -> None:
for at,txt in enumerate(self.names):
a,z = txt[0],txt[-1]
col = (NUM if a.isupper() else SYM)(at=at, txt=txt)
self.all.append(col)
if z != "X":
(self.y if z in "!+-" else self.x).append(col)
if z=="!": self.klass = col
if z=="-": col.goal = 0
#
# DATAs store `rows`, which are summarized in `cols`.
@dataclass
class DATA:
cols : COLS = None # summaries of rows
rows : rows = LIST() # rows
# Another way to create a DATA is to copy the columns structure of
# an existing DATA, then maybe load in some rows to that new DATA.
def clone(self:DATA, rows:rows=[]) -> DATA:
return DATA().add(self.cols.names).adds(rows)
#
# ### Decorators
# I like how JULIA and CLOS let you define all your data types
# before anything else. Also, you can group together related methods
# from different classes. I think that really simplifies explaining the
# code. So this `of` decorator lets me define methods separately from the
# class definition (and, btw, it collects their documentation strings).
def of(doc):
def doit(fun):
fun.__doc__ = doc
self = inspect.getfullargspec(fun).annotations['self']
setattr(globals()[self], fun.__name__, fun)
return doit
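# (Roughly speaking, the decorator rewrites
#     @of("Return central tendency of a DATA.")
#     def mid(self:DATA) -> row: ...
# into `DATA.mid = mid` with the given string installed as `DATA.mid.__doc__`;
# the target class is looked up from the type annotation on `self`.)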
#
# ## Methods
# ### Misc
#
@of("Return central tendency of a DATA.")
def mid(self:DATA) -> row:
return [col.mid() for col in self.cols.all]
@of("Return central tendency of NUMs.")
def mid(self:NUM) -> number: return self.mu
@of("Return central tendency of SYMs.")
def mid(self:SYM) -> number: return self.mode
@of("Return diversity of a NUM.")
def div(self:NUM) -> number: return self.sd
@of("Return diversity of a SYM.")
def div(self:SYM) -> number: return self.ent()
@of("Returns 0..1 for min..max.")
def norm(self:NUM, x) -> number:
return x if x=="?" else ((x - self.lo) / (self.hi - self.lo + 1E-32))
@of("Entropy = measure of disorder.")
def ent(self:SYM) -> number:
return - sum(n/self.n * log(n/self.n,2) for n in self.has.values())
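# Worked example (a sketch): for the counts a:4, b:2, c:1 the entropy is
# -(4/7*log2(4/7) + 2/7*log2(2/7) + 1/7*log2(1/7)), about 1.379 bits, so
#     assert abs(SYM().adds("aaaabbc").ent() - 1.379) < 0.01
# should hold; see also the `syms` demo at the end of this file.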
# ### Add
@of("add COL with many values.")
def adds(self:COL, src) -> COL:
[self.add(row) for row in src]; return self
@of("add DATA with many values.")
def adds(self:DATA, src) -> DATA:
[self.add(row) for row in src]; return self
@of("As a side-effect on adding one row (to `rows`), update the column summaries (in `cols`).")
def add(self:DATA,row:row) -> DATA:
if self.cols: self.rows += [self.cols.add(row)]
else: self.cols = COLS(names=row) # row one defines the columns
return self
@of("add all the `x` and `y` cols.")
def add(self:COLS, row:row) -> row:
[col.add(row[col.at]) for cols in [self.x, self.y] for col in cols]
return row
@of("If `x` is known, add this COL.")
def add(self:COL, x:any) -> any:
if x != "?":
self.n += 1
self.add1(x)
@of("add symbol counts.")
def add1(self:SYM, x:any) -> any:
self.has[x] = self.has.get(x,0) + 1
if self.has[x] > self.most: self.mode, self.most = x, self.has[x]
return x
@of("add `mu` and `sd` (and `lo` and `hi`). If `x` is a string, coerce to a number.")
def add1(self:NUM, x:any) -> number:
self.lo = min(x, self.lo)
self.hi = max(x, self.hi)
d = x - self.mu
self.mu += d / self.n
self.m2 += d * (x - self.mu)
self.sd = 0 if self.n <2 else (self.m2/(self.n-1))**.5
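# Quick check (a sketch): the running update should match the batch mean and
# the sample standard deviation. For [2,4,4,4,5,5,7,9] the squared deviations
# sum to 32 over n-1 = 7 degrees of freedom, so:
#     n = NUM().adds([2,4,4,4,5,5,7,9])
#     assert abs(n.mu - 5) < 1e-9 and abs(n.sd - (32/7)**.5) < 1e-9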
#
# ### Guessing
@of("Guess values at same frequency of `has`.")
def guess(self:SYM) -> any:
r = R()
for x,n in self.has.items():
r -= n/self.n
if r <= 0: return x
return self.mode
@of("Guess values with some `mu` and `sd` (using Box-Muller).")
def guess(self:NUM) -> number:
while True:
x1 = 2.0 * R() - 1
x2 = 2.0 * R() - 1
w = x1*x1 + x2*x2
if w < 1:
tmp = self.mu + self.sd * x1 * sqrt((-2*log(w))/w)
return max(self.lo, min(self.hi, tmp))
@of("Guess a row like the other rows in DATA.")
def guess(self:DATA, fun:Callable=None) -> row:
fun = fun or (lambda col: col.guess())
out = ["?" for _ in self.cols.all]
for col in self.cols.x: out[col.at] = fun(col)
return out
@of("Guess a value that is more like `self` than `other`.")
def exploit(self:COL, other:COL, n=20):
n = (self.n + other.n + 2*the.k)
pr1,pr2 = (self.n + the.k) / n, (other.n + the.k) / n
key = lambda x: 2*self.like(x,pr1) - other.like(x,pr2)
def trio():
x=self.guess()
return key(x),self.at,x
return max([trio() for _ in range(n)], key=nth(0))
@of("Guess a row more like `self` than `other`.")
def exploit(self:DATA, other:DATA, top=1000,used=None):
out = ["?" for _ in self.cols.all]
for _,at,x in sorted([coli.exploit(colj) for coli,colj in zip(self.cols.x, other.cols.x)],
reverse=True,key=nth(0))[:top]:
out[at] = x
if used != None:
used[at] = used.get(at,None) or NUM(at=at)
used[at].add(x)
return out
@of("Guess a row in between the rows of `self` and `other`.")
def explore(self:DATA, other:DATA):
out = self.guess()
for coli,colj in zip(self.cols.x, other.cols.x): out[coli.at] = coli.explore(colj)
return out
@of("Guess value on the border between `self` and `other`.")
def explore(self:COL, other:COL, n=20):
n = (self.n + other.n + 2*the.k)
pr1,pr2 = (self.n + the.k) / n, (other.n + the.k) / n
key = lambda x: abs(self.like(x,pr1) - other.like(x,pr2))
return min([self.guess() for _ in range(n)], key=key)
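# Sanity sketch for the Box-Muller guesser (mirrors the `nums` demo at the end
# of this file): the guesses should roughly reproduce the source distribution.
#     n1 = NUM().adds([R()**2 for _ in range(256)])
#     n2 = NUM().adds([n1.guess() for _ in range(256)])
# Here n2's `mu` and `sd` should land within about 0.05 of n1's.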
#
# ## Distance
@of("Between two values (Aha's algorithm).")
def dist(self:COL, x:any, y:any) -> float:
return 1 if x==y=="?" else self.dist1(x,y)
@of("Distance between two SYMs.")
def dist1(self:SYM, x:number, y:number) -> float: return x != y
@of("Distance between two NUMs.")
def dist1(self:NUM, x:number, y:number) -> float:
x, y = self.norm(x), self.norm(y)
x = x if x !="?" else (1 if y<0.5 else 0)
y = y if y !="?" else (1 if x<0.5 else 0)
return abs(x-y)
@of("Euclidean distance between two rows.")
def dist(self:DATA, r1:row, r2:row) -> float:
n = sum(c.dist(r1[c.at], r2[c.at])**the.p for c in self.cols.x)
return (n / len(self.cols.x))**(1/the.p)
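# (Worked example, as a sketch: with the default p=2 and two x columns whose
# per-column distances are 0.3 and 0.4, the row distance is
# ((0.3**2 + 0.4**2)/2)**0.5 = 0.125**0.5, about 0.354.)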
@of("Sort rows randomly")
def shuffle(self:DATA) -> DATA:
random.shuffle(self.rows)
return self
@of("Sort rows by the Euclidean distance of the goals to heaven.")
def chebyshevs(self:DATA) -> DATA:
self.rows = sorted(self.rows, key=lambda r: self.chebyshev(r))
return self
@of("Compute Chebyshev distance of one row to the best `y` values.")
def chebyshev(self:DATA,row:row) -> number:
return max(abs(col.goal - col.norm(row[col.at])) for col in self.cols.y)
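# (Worked example, as a sketch: for one goal to maximize (goal=1) normalized to
# 0.8 and one goal to minimize (goal=0) normalized to 0.3, the Chebyshev
# distance to heaven is max(|1-0.8|, |0-0.3|) = 0.3; smaller is better.)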
@of("Sort rows by the Euclidean distance of the goals to heaven.")
def d2hs(self:DATA) -> DATA:
self.rows = sorted(self.rows, key=lambda r: self.d2h(r))
return self
@of("Compute euclidean distance of one row to the best `y` values.")
def d2h(self:DATA,row:row) -> number:
d = sum(abs(c.goal - c.norm(row[c.at]))**2 for c in self.cols.y)
return (d/len(self.cols.y)) ** (1/the.p)
#
# ### Nearest Neighbor
@of("Sort `rows` by their distance to `row1`'s x values.")
def neighbors(self:DATA, row1:row, rows:rows=None) -> rows:
return sorted(rows or self.rows, key=lambda row2: self.dist(row1, row2))
@of("Return predictions for `cols` (defaults to klass column).")
def predict(self:DATA, row1:row, rows:rows, cols=None, k=2):
cols = cols or self.cols.y
got = {col.at : [] for col in cols}
for row2 in self.neighbors(row1, rows)[:k]:
d = 1E-32 + self.dist(row1,row2)
[got[col.at].append( (d, row2[col.at]) ) for col in cols]
return {col.at : col.predict( got[col.at] ) for col in cols}
@of("Find weighted sum of numbers (weighted by distance).")
def predict(self:NUM, pairs:list[tuple[float,number]]) -> number:
ws,tmp = 0,0
for d,num in pairs:
w = 1/d**2
ws += w
tmp += w*num
return tmp/ws
@of("Sort symbols by votes (voting by distance).")
def predict(self:SYM, pairs:list[tuple[float,any]]) -> number:
votes = {}
for d,x in pairs:
votes[x] = votes.get(x,0) + 1/d**2
return max(votes, key=votes.get)
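# Usage sketch (mirrors the `predicts` demo at the end of this file): predict a
# held-out row's goal columns from its k nearest labelled neighbours.
#     d = DATA().adds(csv(the.train)).shuffle()
#     test, train = d.rows[0], d.rows[1:]
#     print(d.predict(test, train, cols=d.cols.y, k=5))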
#
# ### Cluster
@dataclass
class CLUSTER:
data : DATA
right : row
left : row
mid : row
cut : number
fun : Callable
lvl : int = 0
lefts : CLUSTER = None
rights : CLUSTER = None
def __repr__(self:CLUSTER) -> str:
return f"{'|.. ' * self.lvl}{len(self.data.rows)}"
def leaf(self:CLUSTER, data:DATA, row:row) -> CLUSTER:
d = data.dist(self.left,row)
if self.lefts and self.lefts.fun( d,self.lefts.cut): return self.lefts.leaf(data,row)
if self.rights and self.rights.fun(d,self.rights.cut): return self.rights.leaf(data,row)
return self
def nodes(self:CLUSTER):
def leafp(x): return x.lefts==None or x.rights==None
yield self, leafp(self)
for node in [self.lefts,self.rights]:
if node:
for x,isLeaf in node.nodes(): yield x, isLeaf
@of("Return two distant rows, optionally sorted into best, then rest")
def twoFar(self:DATA, rows:rows, sortp=False, samples:int=None) -> tuple[row,row] :
left, right = max(((one(rows), one(rows)) for _ in range(samples or the.fars)),
key= lambda two: self.dist(*two))
if sortp and self.chebyshev(right) < self.chebyshev(left): right,left = left,right
return left, right
@of("Divide rows by distance to two faraway points")
def half(self:DATA, rows:rows, sortp=False) -> tuple[rows,rows,row,row,float]:
left,right = self.twoFar(rows, sortp=sortp)
cut = self.dist(left,right)/2
lefts,rights = [],[]
for row in rows:
(lefts if self.dist(row,left) <= cut else rights).append(row)
return self.dist(left,lefts[-1]),lefts, rights, left, right
@of("recursive divide rows using distance to two far points")
def cluster(self:DATA, rows:rows=None, sortp=False, stop=None, cut=None, fun=None, lvl=0):
stop = stop or the.Stop
rows = rows or self.rows
cut1, ls, rs, left, right = self.half(rows,sortp=sortp)
it = CLUSTER(data=self.clone(rows), cut=cut, fun=fun, left=left, right=right, mid=rs[0], lvl=lvl)
if len(ls)>stop and len(ls)<len(rows): it.lefts = self.cluster(ls, sortp, stop, cut1, le, lvl+1)
if len(rs)>stop and len(rs)<len(rows): it.rights = self.cluster(rs, sortp, stop, cut1, gt, lvl+1)
return it
le = lambda x,y: x <= y
gt = lambda x,y: x > y
@of("Diversity sampling (one per items).")
def diversity(self:DATA, rows:rows=None, stop=None):
rows = rows or self.rows
cluster = self.cluster(rows, stop=stop or math.floor(len(rows)**0.5))
for node,leafp in cluster.nodes():
if leafp:
yield node.mid
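# Usage sketch (mirrors the `clusters` demo at the end of this file):
# recursively bisect the rows, then walk the tree printing each node.
#     d = DATA().adds(csv(the.train))
#     for node,leafp in d.cluster(d.rows, sortp=True).nodes():
#       print(node, "leaf" if leafp else "")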
#
# ## Bayes
# of("discretieze.")
# def bin(self:SYM,x): return x
#
# of("discretieze.")
# def bin(self:NUM,x): return math.floor( self.norm(x) * 20 )
#
# of("Return get bins.")
# def bins(self:COL, goal, klasses:classes):
# tmp = {}
# lst = sorted([(r[self.at], y) for y,rows in klasses.items()
# for r in rows if r[self.t] != "?"], key=nth(0))
# for x,y in lst:
# b = self.bin(x)
# tmp[b] = tmp.get(b,None) or SYM(at=self.at)
# tmp[b].add(y)
# return self.bins1(tmp, goal,len(lst))
#
# def bins1(self:SYM, tmp, goal, n):
# return max(tmp, key=lambda sym: sym.power(goal,n))
#
# def bins1(self.NUM, tmp,goal, n):
#
# klasses lst = [out[b] for b in out.keys.sorted()]
# for i,sym in enumerate(lst);
# if i > 0 : sym.last = lst[i-1]
# if i < length(lst) -1 : sym.next = lst[i+1]
# sorted(lst, key=lambda sym: sym.has.powerful
# return most, out, len(lst)
#
# of("Return useful symbolic range.")
# def powerful(self:SYM, goal, klasses:classes):
# most ,_, __ = self.bins(goal, klasses)
# return most
#
# of("Return useful numeric range.")
# def powerful(self:NUM, goal, klasses:classes):
# most, out, all = self.bins(goal, klasses)
# return max(bins, max=lambda sym: sym.has.power(goal,all))
#
# of("Return useful ranges.")
# def power(self:SYM,goal,all):
# b,r = 0,0
# for k,n in self.has.items():
# if k==goal: b += n/all
# else : r += n/all
# return b*b/(b + r)
#
#
# def cdf(klasses:classes, x:Callable):
# lo,hi = lst[0][0], lst[-1][0]
# symp = not isinstance(lo,(inf,float))
# for x,y in lst
# b = x if symp else floor(((x-lo)/(hi-lo+1-32) *20))
# bin[b] = bin.get(b,None) or SYM(at=b)
# bin[b].add(y)
# if nump: return max(bin,key=
@of("How much DATA likes a `row`.")
def loglike(self:DATA, r:row, nall:int, nh:int) -> float:
prior = (len(self.rows) + the.k) / (nall + the.k*nh)
likes = [c.like(r[c.at], prior) for c in self.cols.x if r[c.at] != "?"]
return sum(log(x) for x in likes + [prior] if x>0)
@of("How much a SYM likes a value `x`.")
def like(self:SYM, x:any, prior:float) -> float:
return (self.has.get(x,0) + the.m*prior) / (self.n + the.m)
@of("How much a NUM likes a value `x`.")
def like(self:NUM, x:number, _) -> float:
v = self.sd**2 + 1E-30
nom = exp(-1*(x - self.mu)**2/(2*v)) + 1E-30
denom = (2*pi*v) **0.5
return min(1, nom/(denom + 1E-30))
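# Usage sketch (mirrors the `likings` and `chebys` demos at the end of this
# file): rows should score a higher `loglike` under a DATA built from rows like
# them than under one built from very different rows.
#     d = DATA().adds(csv(the.train))
#     print(sorted(round(d.loglike(r, len(d.rows), 2), 2) for r in d.rows[:5]))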
#
# ### Active Learning
@of("active learning")
def activeLearning(self:DATA, score=lambda B,R: B-R, generate=None, faster=True ):
def ranked(rows): return self.clone(rows).chebyshevs().rows
def todos(todo):
if faster: # Apply our sorting heuristics to just a small buffer at start of "todo"
# rotate back half of buffer to end of list, fill the gap with later items
n = the.buffer//2
return todo[:n] + todo[2*n: 3*n], todo[3*n:] + todo[n:2*n]
else: # Apply our sorting heuristics to all of todo.
return todo,[]
def guess(todo:rows, done:rows) -> rows:
cut = int(.5 + len(done) ** the.cut)
best = self.clone(done[:cut])
rest = self.clone(done[cut:])
a,b = todos(todo)
if generate:
return self.neighbors(generate(best,rest), a) + b
else:
key = lambda r: score(best.loglike(r, len(done), 2), rest.loglike(r, len(done), 2))
return sorted(a, key=key, reverse=True) + b
def loop(todo:rows, done:rows) -> rows:
for k in range(the.Last - the.label):
if len(todo) < 3 : break
top,*todo = guess(todo, done)
done += [top]
done = ranked(done)
return done
return loop(self.rows[the.label:], ranked(self.rows[:the.label]))
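# Usage sketch (mirrors the `_mqs` demo at the end of this file): run active
# learning on a shuffled copy of the data, then report the Chebyshev score of
# the first (best) row it returns.
#     d = DATA().adds(csv(the.train))
#     best = d.shuffle().activeLearning()[0]
#     print(round(d.chebyshev(best), 3))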
#
# ## Utils
# ### One-Liners
# Non-parametric mid and div: the median, and the 90th-10th percentile spread.
def medianSd(a: list[number]) -> tuple[number,number]:
a = sorted(a)
return a[int(0.5*len(a))], (a[int(0.9*len(a))] - a[int(0.1*len(a))])
# Return a function that returns the `n`-th item.
def nth(n): return lambda a:a[n]
# Rounding off
def r2(x): return round(x,2)
def r3(x): return round(x,3)
# Print to standard error.
def dot(s="."): print(s, file=sys.stderr, flush=True, end="")
# Timing
def timing(fun) -> number:
start = time()
fun()
return time() - start
# M-by-N cross val
def xval(lst:list, m:int=5, n:int=5, some:int=10**6) -> Generator[rows,rows]:
for _ in range(m):
random.shuffle(lst)
for n1 in range (n):
lo = len(lst)/n * n1
hi = len(lst)/n * (n1+1)
train, test = [],[]
for i,x in enumerate(lst):
(test if i >= lo and i < hi else train).append(x)
train = random.choices(train, k=min(len(train),some))
yield train,test
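# Usage sketch: m-by-n cross-validation over any list. Each pass, every item
# lands in exactly one test fold; the train split is then resampled (with
# replacement) down to at most `some` items.
#     for train,test in xval(list(range(100))):
#       assert len(train) + len(test) == 100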
# ### Strings to Things
def coerce(s:str) -> atom:
"Coerces strings to atoms."
try: return ast.literal_eval(s)
except Exception: return s
def csv(file) -> Generator[row]:
infile = sys.stdin if file=="-" else open(file)
with infile as src:
for line in src:
line = re.sub(r'([\n\t\r ]|#.*)', '', line)
if line: yield [coerce(s.strip()) for s in line.split(",")]
# ### Settings and CLI
class SETTINGS:
def __init__(self,s:str) -> None:
"Make one slot for any line `--slot ... = value`"
self._help = s
want = r"\n\s*-\w+\s*--(\w+).*=\s*(\S+)"
for m in re.finditer(want,s): self.__dict__[m[1]] = coerce(m[2])
self.sideEffects()
def __repr__(self) -> str:
"hide secret slots (those starting with '_'"
return str({k:v for k,v in self.__dict__.items() if k[0] != "_"})
def cli(self):
"Update slots from command-line"
d = self.__dict__
for k,v in d.items():
v = str(v)
for c,arg in enumerate(sys.argv):
after = sys.argv[c+1] if c < len(sys.argv) - 1 else ""
if arg in ["-"+k[0], "--"+k]:
d[k] = coerce("False" if v=="True" else ("True" if v=="False" else after))
self.sideEffects()
def sideEffects(self):
"Run side-effects."
d = self.__dict__
random.seed(d.get("seed",1))
if d.get("help",False):
sys.exit(print(self._help))
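# (Sketch of what SETTINGS does: it mines the module docstring for option lines
# such as "-k --k int low frequency Bayes hack = 1" and turns each into an
# attribute, so `the.k == 1` and `the.train == "data/misc/auto93.csv"`; `cli()`
# then lets "-k 2" or "--k 2" on the command line override those defaults.)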
#
# ## Tests
class egs:
def all():
for s in dir(egs):
if s[0] != "_" and s != "all":
print(s)
random.seed(the.seed)
getattr(egs,s)()
def nums():
r = 256
n1 = NUM().adds([R()**2 for _ in range(r)])
n2 = NUM().adds([n1.guess() for _ in range(r)])
assert abs(n1.mu - n2.mu) < 0.05, "nums mu?"
assert abs(n1.sd - n2.sd) < 0.05, "nums sd?"
def syms():
r = 256
n1 = SYM().adds("aaaabbc")
n2 = SYM().adds(n1.guess() for _ in range(r))
assert n1.mode == n2.mode, "syms mode?"
assert abs(n1.ent() - n2.ent()) < 0.05, "syms ent?"
def csvs():
d = DATA()
n=0
for row in csv(the.train): n += len(row)
assert n== 3192,"csv?"
def reads():
d = DATA().adds(csv(the.train))
assert d.cols.y[1].n==398,"reads?"
def likings():
d = DATA().adds(csv(the.train)).chebyshevs()
random.shuffle(d.rows)
lst = sorted( round(d.loglike(row,2000,2),2) for row in d.rows[:100])
print(lst)
def order():
for i, row in enumerate( DATA().adds(csv(the.train)).chebyshevs().rows ):
if i % 30 ==0 :print(f"{row}")
def chebys():
d = DATA().adds(csv(the.train))
random.shuffle(d.rows)
lst = d.chebyshevs().rows
mid = len(lst)//2
good,bad = lst[:mid], lst[mid:]
dgood,dbad = d.clone(good), d.clone(bad)
lgood,lbad = dgood.loglike(bad[-1], len(lst),2), dbad.loglike(bad[-1], len(lst),2)
assert lgood < lbad, "chebyshev?"
def guesses():
d = DATA().adds(csv(the.train))
random.shuffle(d.rows)
lst = d.chebyshevs().rows
mid = len(lst)//2
good,bad = lst[:mid], lst[mid:]
dgood,dbad = d.clone(good), d.clone(bad)
print(good[0])
print(bad[-1])
print("exploit",dgood.exploit(dbad,top=2))
print("exploit",dbad.exploit(dgood,top=2))
def clones():
d1 = DATA().adds(csv(the.train))
d2 = d1.clone(d1.rows)
for a,b in zip(d1.cols.y, d2.cols.y):
for k,v1 in a.__dict__.items():
assert v1 == b.__dict__[k],"clone?"
def heavens():
d = DATA().adds(csv(the.train)).d2hs()
lst = [row for i,row in enumerate(d.rows) if i % 30 ==0]
assert d.d2h(d.rows[0]) < d.d2h(d.rows[-1]), "d2h?"
def distances():
d = DATA().adds(csv(the.train))
random.shuffle(d.rows)
lst = sorted( round(d.dist(d.rows[0], row),2) for row in d.rows[:100])
for x in lst: assert 0 <= x <= 1, "dists1?"
assert .33 <= lst[len(lst)//2] <= .66, "dists2?"
def twoFar():
d = DATA().adds(csv(the.train))
for _ in range(100):
a,b = d.twoFar(d.rows, sortp=True)
assert d.chebyshev(a) <= d.chebyshev(b), "twoFar?"
for _ in range(100):
cut,ls,rs,l,r = d.half(d.rows)
print(len(ls),len(rs))
def clusters():
d = DATA().adds(csv(the.train))
cluster = d.cluster(d.rows,sortp=True)
for node,leafp in cluster.nodes():
print(r2(d.chebyshev(node.left)) if node.left else "", node,sep="\t")
def diversities(d=None):
d = d or DATA().adds(csv(the.train))
#leafs = random.choices(leafs, k=min(50, len(leafs)))
print(d.chebyshev(d.clone([row for row in d.diversity(stop=10)]).chebyshevs().rows[0]))
#print(len([d.clone([row for row in d.diversity(stop=stop)]).chebyshevs().rows[0] for _ in range(20)]))
def clusters2():
d = DATA().adds(csv(the.train))
somes = []
mids = stats.SOME(txt="mid")
somes += [mids]
for k in [1,2,3,5]:
ks = stats.SOME(txt=f"k{k}")
somes += [ks]
for train,test in xval(d.rows):
cluster = d.cluster(train)
for want in test:
leaf = cluster.leaf(d, want)
rows = leaf.data.rows
got = d.predict(want, rows, k=k)
mid = leaf.data.mid()
for at,got1 in got.items():
sd = d.cols.all[at].div()
mids.add((want[at] - mid[at])/sd)
ks.add( (want[at] - got1 )/sd)
stats.report(somes)
def predicts(file=None):
d = DATA().adds(csv(file or the.train)).shuffle()
tests, train = d.rows[:10], d.rows[10:]
for test in tests:
for at, got in d.predict(test, train, cols=d.cols.y, k=5).items():
want = test[at]
print(at, r3(abs(got - want)/d.cols.all[at].div()))
def _MQS():
for i,arg in enumerate(sys.argv):
if arg[-4:] == ".csv":
the.train=arg
random.seed(the.seed)
egs._mqs()
def _mqs():
print(the.train, flush=True, file=sys.stderr)
print("\n"+the.train)
repeats = 20
d = DATA().adds(csv(the.train))
b4 = [d.chebyshev(row) for row in d.rows]
asIs,div = medianSd(b4)
rnd = lambda z: z
print(f"asIs\t: {asIs:.3f}")
print(f"div\t: {div:.3f}")
print(f"rows\t: {len(d.rows)}")
print(f"xcols\t: {len(d.cols.x)}")
print(f"ycols\t: {len(d.cols.y)}\n")
somes = [stats.SOME(b4,f"asIs,{len(d.rows)}")]
for n in [20,25,30,50,100]:
the.Last = n
rand = []
for _ in range(repeats):
some = d.shuffle().rows[:n]
d1 = d.clone().adds(some).chebyshevs()
rand += [rnd(d.chebyshev(d1.rows[0]))]
start = time()
pool = [rnd(d.chebyshev(d.shuffle().activeLearning()[0]))
for _ in range(repeats)]
print(f"pool.{n}: {(time() - start) /repeats:.2f} secs")
generate1 =lambda best,rest: best.exploit(rest,1000)
start = time()
mqs1000 = [rnd(d.chebyshev(d.shuffle().activeLearning(generate=generate1)[0]))
for _ in range(repeats)]
print(f"mqs1K.{n}: {(time() - start)/repeats:.2f} secs")
used={}
generate2 =lambda best,rest: best.exploit(rest,top=4,used=used)
start = time()
mqs4 = [rnd(d.chebyshev(d.shuffle().activeLearning(generate=generate2)[0]))
for _ in range(20)]
print(f"mqs4.{n}: {(time() - start)/repeats:.2f} secs")
somes += [stats.SOME(rand, f"random,{n}"),
stats.SOME(pool, f"pool,{n}"),
stats.SOME(mqs4, f"mqs4,{n}"),
stats.SOME(mqs1000, f"mqs1000,{n}")]
stats.report(somes, 0.01)
#
# ## Main
the = SETTINGS(__doc__)
if __name__ == "__main__" and len(sys.argv)> 1:
the.cli()
random.seed(the.seed)
getattr(egs, the.eg, lambda : print(f"ezr: [{the.eg}] unknown."))()