-
Notifications
You must be signed in to change notification settings - Fork 0
/
ast-diff.lisp
4372 lines (4072 loc) · 183 KB
/
ast-diff.lisp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
;;; ast-diff.lisp --- diffs between ASTs and other tree structures
;;;
;;; The @code{resolve/ast-diff} library provides
;;; for the differencing of software objects. A general tree
;;; differencing algorithm (originally adopted from
;;; @url{http://thume.ca/2017/06/17/tree-diffing/#a-tree-diff-optimizer})
;;; is used to difference tree genomes. Optimizations are made which
;;; assume the common cases of differences typically found in software
;;; development in which most top-level subtrees have no differences.
;;; This library is used to define the @code{ast-diff} and
;;; @code{ast-merge} command-line executables. These executables
;;; provide for AST-level differencing and may be used as a
;;; replacement for common line- or word-based differencing tools
;;; during software development.
;;;
;;; MERGE-DIFF-2 should hold the language-specific logic.
;;;
;;; MELD take conflicts and moves them up to higher levels.
;;;
;;; MERGE-DIFFS-ON-SYMS generic function which dispatches off of
;;; combinations of edit operations. This should fail more frequently
;;; (generate more :CONFLICT nodes). Could write new methods to
;;; override existing methods.
;;;
;;; @texi{ast-diff}
(defpackage :resolve/ast-diff
(:use
:gt/full
:resolve/core
:software-evolution-library
:software-evolution-library/software/parseable
:software-evolution-library/software/simple
:software-evolution-library/software/ir
:resolve/string
:metabang-bind)
(:import-from :fare-quasiquote)
(:import-from :trivia.fail :fail)
(:shadowing-import-from :software-evolution-library/terminal
:+color-RED+ :+color-GRN+ :+color-RST+)
(:shadowing-import-from :functional-trees
:child-slots :slot-spec-slot
:children-alist :slot-specifier
:slot-specifier-slot)
(:import-from :software-evolution-library/software/tree-sitter
:tree-sitter-ast :output-transformation
:computed-text :structured-text :choice-superclass)
(:local-nicknames
(:range :software-evolution-library/utility/range)
(:ts :software-evolution-library/software/tree-sitter)
(:iter :iterate))
(:export
:ast-cost
:ast-size
:ast-can-recurse
:ast-diff
:ast-diff-elide-same
:ast-patch
:astify
:unastify
:unastify-lisp-diff
:print-diff
;; Merge functions
:chunk
:diff3
:merge3
:converge
:merge-diffs-on-syms
;; Functions needed by alist.lisp
:record-unstable
:merge-diffs2
:merge3
;; Edit tree symbols
:create-edit-tree
:create-and-print-edit-tree
:edit-tree-node-base
:edit-tree-node
:edit-tree-node-script
:edit-tree-node-children
:print-edit-tree-node
:print-edit-tree
:describe-edit-tree-node
:describe-edit-tree
:map-edit-tree
:ast-size
:ast-to-list-form
:*base-cost*
:*max-wrap-diff*
:*wrap*
:*wrap-sequences*
:*diff-strings-p*
#+(or) :*ignore-whitespace*))
(in-package :resolve/ast-diff)
(in-readtable resolve-readtable)
;;; Comments on further algorithm improvements
;;;
;;; The "good enough" algorithm could be made slightly better
;;; by allowing limited lookahead. With k lookahead it could
;;; run in O(k max(m,n)) time, m and n the lengths of the sequences.
;;;
;;; The hash function has not been fully tuned for speed.
;;;
;;; RECURSIVE-DIFF has an UPPER-BOUND argument. This is not used
;;; now, but could be used to speed up the slow part of the algorithm
;;; if we want an exact solution rather than the "good enough" solution.
;;; Use the cost of the "good enough" solution to provide an upper bound
;;; for the exact solution. Also, recursive calls could provide their
;;; own upper bounds based on the optimum path to the node from which
;;; the call is being made.
;;;
;;; The dynamic programming algorithm uses O(mn) space. It can be
;;; changed to use linear space. There are various schemes in the
;;; literature for doing LCS and edit distance in linear space.
;;; A simple one is as follows: note that we can compute the length
;;; of the LCS (or the edit distance) in linear space by scanning
;;; the array keeping just two rows. We cannot reconstruct the
;;; edit from this, but we can record which entry on the m/2 row
;;; was in the final optimal solution. Once we have done that,
;;; the parts before and after that point can be recomputed
;;; recursively.
;;;
;;; The speed on C programs is now dominated by the time needed
;;; for Clang to parse the C and send the AST to Lisp.
;;;
;;; It may be useful to have a hash function on ASTs that produces
;;; smaller integers, and use ast-hash-with-check to handle collisions.
;;; This could be tied in with a general mechanism for hash consing
;;; of ASTs.
(declaim (special *cost-table*))
(deftype edit-action ()
'(member
:bad
:conflict
:recurse :recurse-tail
:insert :insert-sequence
:delete :delete-sequence
:replace
:same :same-tail :same-sequence :same-or-recurse
:wrap :wrap-sequence
:unwrap :unwrap-sequence))
(defparameter *base-cost* 2
"Basic cost of a diff, before adding costs of components.")
(defvar *ignore-whitespace* nil
"If true, inserting or removing whitespace in a string has zero cost")
(declaim (boolean *diff-strings-p*))
(defvar *diff-strings-p* t
"If true, descend into strings when computing diffs.")
(declaim (boolean *wrap* *wrap-sequences*))
(defvar *wrap* nil
"If true, perform wrap/unwrap actions in diffs.")
(defvar *wrap-sequences* nil
"If true, perform wrap-sequence/unwrap-sequence actions in diffs.")
(defvar *max-wrap-diff* 500
"When *wrap* is true, this is the maximum size difference for
wrapping and unwrapping to be considered.")
(defun clength (x)
"Compute the length of X in conses."
(iter (while (consp x)) (pop x) (summing 1)))
(defgeneric ast-class (ast)
(:documentation "Return the class of AST as a symbol.
For a real AST, this is just its class name, but it can be other
symbols for other types.")
(:method ((x ast))
(class-name-of x)))
(defun cons-cost (x)
"Compute the cost of a tree of conses."
(if (not (consp x)) 1
(let ((conses nil))
(let ((y x))
(iter (while (consp y))
(push y conses)
(pop y)))
(let ((cost 1))
(iter (while conses)
(incf cost (cons-cost (car (pop conses)))))
cost))))
;; #+sbcl (declaim (optimize sb-cover:store-coverage-data))
;;; Interface functions.
(defgeneric ast-cost (ast)
(:documentation "Return cost of AST.")
(:method ((ast ast))
(cl:reduce #'+ (children ast) :key #'ast-cost :initial-value 1))
(:method ((ast tree-sitter-ast))
(cl:reduce #'+ (standardized-children ast) :key #'ast-cost :initial-value 1))
;; Slot-specifiers are introduced as markers in the standardized-children
;; lists of tree-sitter-ast nodes, and should not contribute the
;; the cost.
(:method ((ss slot-specifier)) 0)
(:method ((ast t))
1)
(:method ((s string))
(if *ignore-whitespace*
(count-if-not #'whitespacep s)
(length s)))
(:method ((ast vector))
(length ast))
(:method ((ast null))
1)
(:method ((ast cons))
(+ (iter (sum (ast-cost (pop ast)))
(while (consp ast)))
;; cost of terminal NIL is 0
(if ast (ast-cost ast) 0))))
(defgeneric ast-can-recurse (ast-a ast-b)
(:documentation "Check if recursion is possible on AST-A and AST-B. Strings
can be recursed on if `*diff-strings-p*' is true (defaults to true)")
(:method ((ast-a string) (ast-b string))
*diff-strings-p*)
(:method ((ast-a t) (ast-b t))
nil)
(:method ((ast-a ast) (ast-b ast))
t))
(defmethod source-text ((ast cons) &rest args &key stream)
(iter (while (consp ast)) (apply #'source-text (pop ast) args))
(when ast
(write-string "." stream)
(apply #'source-text ast args)))
(defmethod source-text ((ast slot-specifier) &key stream)
(write-sequence "" stream))
(defgeneric ast-to-list-form (ast)
(:documentation "Convert ast into a more readable list form")
(:method ((ast ast))
(cons (ast-class ast)
(mapcar #'ast-to-list-form (children ast))))
(:method (ast) ast))
(defgeneric ast-size (node)
(:documentation "Number of nodes and leaves in an AST or ast-like thing")
(:method ((node ast))
(reduce #'+ (children node) :key #'ast-size :initial-value 1))
(:method ((node t)) 1))
;;; Wrapper for Lisp lists in simple ASTs
;;; This is so we don't need to confuse the ast-diff machinery
;;; by making it handle raw lisp lists, and so information
;;; like size and cost can be cached in the ast nodes
;;;
;;; Each SIMPLE-LISP-AST is a list (either proper or improper)
;;; The elements of CHILDREN are the proper elements of the list,
;;; and either the tail value (if the list is improper) or :NIL
;;; (if the list is proper). This will collide on improper lists
;;; that end in :NIL, but there is nothing special about :NIL, so fix
;;; this up later.
(defclass simple-lisp-ast (functional-tree-ast)
((children :initarg :children :initform nil)
(child-slots :initform '(children) :allocation :class)
;; Slots for caching
(cost-cache :initarg :cost-cache
:initform nil
:accessor cost-cache)
(size-cache :initarg :size-cache
:initform nil
:accessor size-cache)
(unastify-cache :initarg :unastify-cache
:initform nil
:accessor unastify-cache)))
(defmethod ast-class ((ast simple-lisp-ast)) :list)
(defmethod source-text ((ast simple-lisp-ast) &key stream)
(let ((v (unastify ast)))
(if v
(format stream "~a" (unastify v))
(write-string "()" stream))))
(defmethod equal? ((a simple-lisp-ast) (b simple-lisp-ast))
(equalp (unastify a) (unastify b)))
(defmethod ast-cost :around ((ast simple-lisp-ast))
(ensure (cost-cache ast) (call-next-method)))
(defmethod ast-size :around ((ast simple-lisp-ast))
(ensure (size-cache ast) (call-next-method)))
(defmethod copy :around ((obj simple-lisp-ast) &rest args
&key &allow-other-keys)
(apply #'call-next-method obj
;; Don't copy the cache slots.
:cost-cache nil :size-cache nil :unastify-cache nil
args))
(defgeneric astify (x)
(:documentation "Convert a Lisp data structure to a SIMPLE-LISP-AST")
(:method ((x software))
(genome x))
(:method ((x simple))
(lines x)))
(defgeneric unastify (x)
(:documentation "Convert a SIMPLE-LISP-AST to a Lisp data structure"))
(def +end-marker+ :nil
"Value used to mark the end of a proper list, as AST-DIFF treats NIL
as a list.")
(defmethod source-text ((x (eql +end-marker+)) &rest args &key)
(apply #'source-text "" args))
(defmethod astify ((x list))
(if (proper-list-p x)
;; Add an end marker to represent the NIL
;; (because AST-DIFF treats NIL as a list)
(make-instance 'simple-lisp-ast
:children (nconc (mapcar #'astify x) (list +end-marker+))
:unastify-cache x)
;; Properize the list
(let ((original-x x)
(properized-x
(iter (collecting
(if (consp x)
(car x)
(progn
(assert (not (eql x +end-marker+)) ()
"End marker ~s found"
x)
x)))
(while (consp x))
(pop x))))
(make-instance 'simple-lisp-ast
:children (mapcar #'astify properized-x)
:unastify-cache original-x))))
(defmethod astify (x) x)
(defmethod unastify ((ast simple-lisp-ast))
(or (unastify-cache ast)
(unastify-list (children ast))))
(defmethod unastify (val) val)
(defun unastify-list (c)
(and c
(let ((last-c (lastcar c)))
(if (eql last-c +end-marker+)
(mapcar #'unastify (butlast c))
(nconc (mapcar #'unastify (butlast c))
last-c)))))
(defun unastify-lisp-diff (d)
(typecase d
(simple-lisp-ast (unastify d))
(cons
(cons (unastify-lisp-diff (car d))
(unastify-lisp-diff (cdr d))))
(t d)))
(defmethod print-object ((obj simple-lisp-ast) stream)
(if *print-readably*
(call-next-method)
(print-unreadable-object (obj stream :type t)
(format stream ":VALUE ~s" (unastify obj)))))
;;; Classes for "edit trees"
;;;
;;; It's useful to extract an "edit tree" from an edit script.
;;; The edit tree is a tree representation of the parts of the
;;; diff that group the edit, hierarchically, into subedits.
;;;
;;; Each node is a mapping from one edit segment in the original tree
;;; to another edit segment in the target tree. An edit segment
;;; represents a piece of the tree that is being changed by the a part
;;; of the edit script; it is essentially a pointer into the tree with
;;; a target node, an offset, and a length.
(defclass size-mixin ()
((size :reader ast-size
:documentation "Cache slot for AST-SIZE function"))
(:documentation "Mixin to give a class a slot for caching
the AST-SIZE value"))
(defclass edit-segment-common (size-mixin)
((node ;; :type (or ast-node list) ;; Need LIST because we don't just do ASTs
:initarg :node
:accessor edit-segment-node
:documentation "The tree node for which a subset
of the children (possibly empty) form this edit segment")
(start :type (integer 0)
:initarg :start
:accessor edit-segment-start
:documentation "The index (starting at 0) of the first
child in the edit segment"))
(:documentation "Common slots of edit-segment classes"))
(defclass edit-segment (edit-segment-common)
((length :type (integer 0)
:initarg :length
:accessor edit-segment-length
:documentation "The number of children (possibly zero)
in the edit segment"))
(:documentation "An edit-segment points to a subset of the children
of a tree node"))
(defclass string-edit-segment (edit-segment-common)
((string-start :type integer
:initarg :string-start
:accessor string-edit-segment-string-start
:documentation "Start of the substring that is
the AST segment")
(string-length :type integer
:initarg :string-length
:accessor string-edit-segment-string-length
:documentation "Length of the substring that is
the edit segment"))
(:documentation "This is a special case, because there may be edits
that descend into the strings at the leafs of a tree. These need
special representation as edit-segments, recording both the position
of the leaf in the children of its parent in the tree, and the position
of the substring inside the string."))
(defmethod source-text ((segment edit-segment) &rest args &key)
(with-accessors ((start edit-segment-start)
(len edit-segment-length)
(ast-node edit-segment-node))
segment
(mapc (lambda (ast)
(apply #'source-text ast args))
(subseq (standardized-children ast-node) start (+ start len)))))
(defmethod source-text ((segment string-edit-segment) &key stream)
(with-slots (node start string-start string-length)
segment
(assert node)
(write-string (elt (children node) start) stream
:start string-start
:end (+ string-start string-length))))
(defmethod ast-to-list-form ((segment string-edit-segment))
(source-text segment))
(def +list-ellipsis+ '(|...|))
(defmethod ast-to-list-form ((segment edit-segment))
(with-accessors ((start edit-segment-start)
(len edit-segment-length)
(ast-node edit-segment-node))
segment
(let ((children (standardized-children ast-node)))
`(,(ast-class ast-node)
,@(unless (eql start 0) +list-ellipsis+)
,@(mapcar #'ast-to-list-form
(subseq children
start (+ start len)))
,@(unless (eql (+ start len)
(length children))
+list-ellipsis+)))))
(defclass edit-tree-node-base (size-mixin)
((script :type list
:initarg :script
:accessor edit-tree-node-script
:documentation "The part of the edit script that
applies to this part of the diff")
(children :type list ;; of edit-tree-node objects
:initarg :children
:initform nil
:accessor edit-tree-node-children
:documentation "Children, in left-to-right order
in the source tree, of this edit-tree-node"))
(:documentation "Base class for edit tree nodes"))
(defclass edit-tree-node (edit-tree-node-base)
((source :type edit-segment-common
:initarg :source
:accessor edit-tree-node-source
:documentation "Segment in the source tree being
rewritten by part of the edit script")
(target :type edit-segment-common
:initarg :target
:accessor edit-tree-node-target
:documentation "Segment in the target tree being
rewritten TO by part of the edit script"))
(:documentation " "))
(defmethod print-object ((node edit-tree-node) stream)
(print-unreadable-object (node stream :type t :identity nil)
(format stream "~s"
(ellipsize (source-text (edit-tree-node-source node)) 30))))
(defmethod slot-unbound (class (node edit-tree-node) (slot (eql 'size)))
(declare (ignorable class))
(setf (slot-value node slot)
(reduce #'+ (edit-tree-node-children node)
:key #'ast-size :initial-value 1)))
(defmethod ast-size ((segment string-edit-segment)) 1)
;; Cache for SIZE slot, accessed by ast-size
(defmethod slot-unbound (c (segment edit-segment) (slot (eql 'size)))
(declare (ignorable c))
(nest
(with-accessors ((node edit-segment-node)
(length edit-segment-length)
(start edit-segment-start))
segment)
(let* ((children (subseq (standardized-children node) start (+ start length)))
(value (reduce #'+ children :key #'ast-size :initial-value 1)))
(setf (slot-value segment slot) value))))
;;; Main interface to calculating ast differences.
(defgeneric ast-diff* (ast-a ast-b)
(:documentation
"Return a least-cost edit script which transforms AST-A into AST-B.
Also return a second value indicating the cost of the edit.
See `ast-patch' for more details on edit scripts.
The following generic functions may be specialized to configure
differencing of specialized AST structures.; `equal?',
`ast-cost' and `ast-can-recurse'."))
;; The cache maps key pairs to values and counts
(def +ast-diff-cache+ (make-hash-table :size 1021))
(def +ast-diff-counter+ 0)
(declaim (type (integer 0 2000000000) +ast-diff-counter+))
(def +hash-upper-limit+ 100000000)
(defparameter *bucket-size* 20)
(defmethod ast-diff* :around (ast-a ast-b)
(let* ((key (cons ast-a ast-b))
(hash (ast-hash key))
;; val-alist maps keys to (val . count) pairs
(val-alist (gethash hash +ast-diff-cache+))
(pair (assoc key val-alist :test #'equal)))
(cond
;; There is already a cached value.
(pair
(let ((vals (cadr pair)))
(values (car vals) (cdr vals))))
((>= (length val-alist) *bucket-size*)
;; If a bucket gets too big, just stop caching
;; things that map there
(call-next-method))
(t
(flet ((consolidate-insert-delete-pairs (diff)
;; Check for insert, delete pairs
;; When we find one, replace with a :REPLACE edit
(let* ((changed nil)
(e diff)
(new-diff
(iter (while e)
(cond
((and (consp (car e))
(consp (cadr e))
(eql (caar e) :insert)
(eql (caadr e) :delete))
(setf changed t)
(collecting
`(:replace ,(cdr (cadr e)) ,(cdr (car e))))
(pop e) (pop e))
(t (collecting (pop e)))))))
(if changed new-diff diff))))
(mvlet* ((diff cost (call-next-method))
(new-diff (consolidate-insert-delete-pairs diff))
(cost (if (eq new-diff diff) cost
(diff-cost new-diff)))
(diff new-diff))
;; Thin the cache if necessary.
(when (>= +ast-diff-counter+ +hash-upper-limit+)
(setf +ast-diff-counter+
(thin-ast-diff-table +ast-diff-cache+ +ast-diff-counter+)))
(assert (< +ast-diff-counter+ +hash-upper-limit+))
;; Push the diff and cost to a hash bucket.
(push (list* key (cons diff cost) +ast-diff-counter+)
(gethash hash +ast-diff-cache+))
(incf +ast-diff-counter+)
(values diff cost)))))))
(defun thin-ast-diff-table (cache counter)
"Thin out the AST diff cache when it gets too big by dropping the
oldest half of the entries."
(assert (>= counter +hash-upper-limit+))
(let* ((h2 (ash +hash-upper-limit+ -1))
(d (- counter h2)))
(maphash (lambda (k v)
(let* ((head (cons nil v))
(p head)
(n (cdr p)))
(loop
(unless n (return))
;; The entries are lists of the form (KEY (DIFF .
;; COST) . SAVED_COUNTER), where SAVED_COUNTER is
;; the value of `ast-diff-counter' when the diff
;; and cost were cached. If the saved counter is
;; less than H2, we drop the entry; otherwise we
;; decrement the saved counter so the entry will be
;; in the older half the next time we thin the
;; table.
(symbol-macrolet ((saved-counter (cddar n)))
(if (< saved-counter d)
(setf (cdr p) (cdr n))
(progn
(decf saved-counter d)
(shiftf p n (cdr n))))))
(unless (eql (cdr head) v)
(setf (gethash k cache) (cdr head)))))
cache)
h2))
(defun clear-ast-diff-table ()
(setf +ast-diff-counter+ 0)
(clrhash +ast-diff-cache+))
(defun ast-diff (ast-a ast-b
&key
((:ignore-whitespace *ignore-whitespace*)
*ignore-whitespace*)
((:strings *diff-strings-p*) *diff-strings-p*)
((:wrap-sequences *wrap-sequences*) *wrap-sequences*)
((:wrap *wrap*) (or *wrap* *wrap-sequences*))
((:max-wrap-diff *max-wrap-diff*) *max-wrap-diff*)
((:base-cost *base-cost*) *base-cost*)
&allow-other-keys)
;; Convert raw lisp data to asts
(let ((ast-a (astify ast-a))
(ast-b (astify ast-b)))
;; Bag computation to accelerate wrapping/unwrapping will go here
(unwind-protect
(ast-diff* ast-a ast-b)
(clear-ast-diff-table))))
(defgeneric same-class-p (object-a object-b)
(:documentation "Return T if OBJECT-1 and OBJECT-2 are of the same class.")
(:method ((ast-a ast) (ast-b ast)) (eql (ast-class ast-a) (ast-class ast-b)))
(:method ((ast-a structured-text) (ast-b structured-text))
(or (eql (ast-class ast-a) (ast-class ast-b))
;; NOTE: choice expansion subclassing allows for an AST
;; with multiple possible representations/choices
;; to have each possibility represented as its own
;; subclass. In these cases, it's the superclass
;; that really matters.
(and (slot-exists-p ast-a 'choice-superclass)
(slot-exists-p ast-b 'choice-superclass)
(eql (choice-superclass ast-a) (choice-superclass ast-b))))))
(defmethod ast-diff* ((ast-a ast) (ast-b ast))
#+debug (format t "ast-diff[AST] AST-CAN-RECURSE: ~S~%"
(ast-can-recurse ast-a ast-b))
(multiple-value-bind (diff cost)
;; Initial result from diffing the standardized children.
(when (same-class-p ast-a ast-b)
(ast-diff*-lists (standardized-children ast-a)
(standardized-children ast-b)
ast-a ast-b))
(when *wrap*
;; If wrapping makes the diff cheaper, use it.
(multiple-value-bind (wrap-diff wrap-cost)
(ast-diff-wrap ast-a ast-b)
(when (and wrap-cost (or (null cost) (< wrap-cost cost)))
(setf diff wrap-diff
cost wrap-cost)))
;; If unwrapping makes the diff (even?) cheaper, use it.
(multiple-value-bind (unwrap-diff unwrap-cost)
(ast-diff-unwrap ast-a ast-b)
(when (and unwrap-cost (or (null cost) (< unwrap-cost cost)))
(setf diff unwrap-diff
cost unwrap-cost))))
;; At this point we have the cheapest of three possible diffs, or
;; none.
(if diff
(values diff cost)
(call-next-method))))
(defun map-ast-while/path (ast fn &optional path)
"Apply FN to the nodes of AST A, stopping
the descent when FN returns NIL. FN is also passed a PATH
argument, which is a list (in reverse order) of the indices
of children leading down to the node."
(when (funcall fn ast path)
(iter (for c in (children ast))
(for i from 0)
(when (typep c 'ast) (map-ast-while/path c fn (cons i path))))))
(defgeneric ast-diff-wrap (ast-a ast-b &key skip-root first-ast-child)
(:documentation
"Find a minimum cost 'wrap' edit, which wraps an AST in a larger AST."))
(defmethod ast-diff-wrap ((ast-a ast) (ast-b ast)
&key (skip-root t) first-ast-child
&aux (max-wrap-diff *max-wrap-diff*))
;; search over the ASTs under ast-b that are the same class as ast-a,
;; and for which the size difference is not too large
(let* ((ast-a-cost (ast-cost ast-a))
(a-class (ast-class ast-a))
(max-cost (+ ast-a-cost max-wrap-diff))
(min-cost (- ast-a-cost max-wrap-diff))
(best-candidate nil)
(best-cost most-positive-fixnum)
;; Do not also search for wraps in the recursive calls
#+(or) (*wrap* nil))
(when (integerp first-ast-child)
(setf first-ast-child (elt (children ast-a) first-ast-child)))
#+ast-diff-wrap-debug (format t "(ast-class ast-a) = ~S~%" a-class)
(nest
(map-ast-while/path ast-b)
(lambda (x path) #+ast-diff-wrap-debug (format t "Path = ~A~%" path))
(if (and (null path) skip-root) t)
(let ((x-cost (ast-cost x))))
(cond
;; If X is too small, stop search down into it
((< x-cost min-cost)
#+ast-diff-wrap-debug (format t "~a is too small~%" x)
nil)
;; If X is too large, skip it but keep searching
((> x-cost max-cost)
#+ast-diff-wrap-debug (format t "~a is too big~%" x)
t)
;; If X is not the right class, also skip it
((not (eql (ast-class x) a-class))
#+ast-diff-wrap-debug (format t "~a is wrong class~%" x)
t)
;; If the first AST child is not found in the list of children
;; with a "good" match, skip
((and first-ast-child (not (ast-child-check first-ast-child x)))
t))
;; Only if the size is in the right range, and the
;; ast-class matches, do we try to insert here
(t) (multiple-value-bind (diff cost) (ast-diff* ast-a x))
(when (< cost best-cost))
(multiple-value-bind (left-wrap right-wrap classes)
(wraps-of-path ast-b (reverse path)))
(let ((total-cost (+ cost
(cost-of-wrap left-wrap)
(cost-of-wrap right-wrap))))
(when (< total-cost best-cost)
#+ast-diff-wrap-debug
(progn
(format t "Wrap candidate found~%")
(format t "Cost = ~a~%" total-cost)
(format t "ast-a = ~s~%" (ast-to-list-form ast-a))
(format t "ast-b = ~s~%" (ast-to-list-form ast-b))
(format t "diff = ~s~%" diff)
(format t "path = ~s~%" path)
(format t "left-wrap = ~s~%" left-wrap)
(format t "right-wrap = ~s~%" right-wrap)
(format t "classes = ~s~%" classes))
(setf best-cost total-cost
best-candidate (list :wrap diff path left-wrap right-wrap classes
ast-b)))))
(when best-candidate
(values best-candidate best-cost))))
(defgeneric ast-diff-unwrap (ast-a ast-b &key skip-root)
(:documentation "Find a minimum cost 'unwrap' edit, which pulls a subast
out of one tree and turns it into another."))
(defmethod ast-diff-unwrap ((ast-a ast) (ast-b ast)
&key (skip-root t) first-ast-child
&aux (max-wrap-diff *max-wrap-diff*))
;; search over the ASTs under ast-a that are the same class as ast-b,
;; and for which the size difference is not too large
(let* ((ast-b-cost (ast-cost ast-b))
(b-class (ast-class ast-b))
(max-cost (+ ast-b-cost max-wrap-diff))
(min-cost (- ast-b-cost max-wrap-diff))
(best-candidate nil)
(best-cost most-positive-fixnum)
;; Do not also search for wraps in the recursive call
(*wrap* nil))
(when (integerp first-ast-child)
(setf first-ast-child (elt (children ast-b) first-ast-child)))
#+ast-diff-unwrap-debug (format t "(ast-class ast-b) = ~S~%" b-class)
(nest
(map-ast-while/path ast-a)
(lambda (x path) #+ast-diff-unwrap-debug (format t "Path = ~A~%" path))
(if (and (null path) skip-root) t)
(let ((x-cost (ast-cost x))))
(cond
;; If X is too small, stop search down into it
((< x-cost min-cost)
#+ast-diff-unwrap-debug (format t "~a is too small~%" x)
nil)
;; If X is too large, skip it but keep searching
((> x-cost max-cost)
#+ast-diff-unwrap-debug (format t "~a is too big~%" x)
t)
;; If X is not the right class, also skip it
((not (eql (ast-class x) b-class))
#+ast-diff-unwrap-debug (format t "~a is wrong class~%" x)
t)
;; If the first AST child is not found in the list of children
;; with a "good" match, skip
((and first-ast-child (not (ast-child-check first-ast-child x t)))
t))
;; Only if the size is in the right range, and the
;; ast-class matches, do we try to insert here
(t)
(multiple-value-bind (diff cost) (ast-diff* x ast-b))
(when (< cost best-cost)
(multiple-value-bind (left-wrap right-wrap)
(wraps-of-path ast-a (reverse path))
(let ((total-cost (+ cost
*base-cost*
(cost-of-wrap left-wrap)
(cost-of-wrap right-wrap))))
(when (< total-cost best-cost)
(let ((new (list :unwrap diff (reverse path) left-wrap right-wrap)))
#+ast-diff-unwrap-debug
(format t "Replace~%~a (~a)~%with~%~a (~a)~%"
best-candidate best-cost
new total-cost)
(setf best-cost total-cost
best-candidate new)))))))
(when best-candidate
(values best-candidate best-cost))))
(defgeneric ast-child-check (a b &optional reverse?)
(:documentation
"Check that A is 'close enough' to some child of b")
(:method ((a ast) (b ast) &optional reverse?)
(let* ((a-cost (ast-cost a))
(cost-limit (floor (* *base-cost* a-cost 1/2))))
(some (lambda (c)
(and (typep c 'ast)
(<= 1/2 (/ a-cost (ast-cost c)) 3/2)
;; REVERSE? is used so we don't compute ast-diffs backwards
;; when checking for UNWRAP
(< (nth-value 1 (if reverse? (ast-diff* c a) (ast-diff* a c))) cost-limit)))
(children b))))
(:method ((a t) (b t) &optional reverse?) (declare (ignore reverse?))
(equal? a b)))
(defun wraps-of-path (ast path)
"Computes lists of children that lie on the left and right sides of a path
down from AST, as well as the classes of the nodes along the path."
;; (format t "Wraps of PATH = ~s in ~s~%" path (ast-to-list-form ast))
(iter (while path)
(assert (typep ast 'ast))
(let ((c (children ast))
(i (pop path)))
(assert (<= 0 i))
(assert (length< i c))
(collect (ast-class ast) into classes)
(multiple-value-bind (left-half right-half)
(halves c i)
(collect left-half into left)
(collect (rest right-half) into right))
(setf ast (elt c i))
(assert (typep ast 'ast)))
(finally
#+ast-diff-debug (format t "Result: ~s ~s ~s~%" left right classes)
(return (values left right classes)))))
(defun cost-of-wrap (wrap &aux (base-cost *base-cost*))
"Computes the sum of the costs of the objects in a wrap"
(reduce #'+ wrap
:initial-value 0
:key (lambda (w) (reduce #'+ w :key #'ast-cost :initial-value base-cost))))
(defmethod ast-diff-wrap ((ast-a t) (ast-b t) &key skip-root first-ast-child)
(declare (ignore skip-root first-ast-child))
nil)
(defmethod ast-diff-unwrap ((ast-a t) (ast-b t) &key skip-root first-ast-child)
(declare (ignore skip-root first-ast-child))
nil)
(-> ast-diff-wrap-sequence (ast sequence ast)
(values (or list (eql :bad))
fixnum))
(defun ast-diff-wrap-sequence (ast-a sub-a ast-b &aux (len (length sub-a)))
(assert (>= len 2))
(let ((sub-ast (copy ast-a :children (coerce sub-a 'list)))
(*wrap-sequences* nil)
(*wrap* nil)
(first-ast-child (position-if {typep _ 'ast} sub-a)))
(let ((diff (ast-diff-wrap sub-ast ast-b :skip-root nil
:first-ast-child first-ast-child)))
(if (consp diff)
(let ((new-diff `(:wrap-sequence ,len ,@(cdr diff))))
(values new-diff (diff-cost new-diff)))
(values :bad most-positive-fixnum)))))
(-> ast-diff-unwrap-sequence ((or ast fixnum) ast sequence)
(values (or list (eql :bad))
fixnum))
(defun ast-diff-unwrap-sequence (a ast-b sub-b)
(let ((len (length sub-b)))
(assert (>= len 2))
(let ((sub-ast (copy ast-b :children (coerce sub-b 'list)))
(*wrap-sequences* nil)
(*wrap* nil)
(first-ast-child (position-if {typep _ 'ast} sub-b)))
(let ((diff (ast-diff-unwrap a sub-ast :skip-root nil
:first-ast-child first-ast-child)))
(if (consp diff)
(let ((new-diff (cons :unwrap-sequence (cdr diff))))
(values new-diff (diff-cost diff)))
(values :bad most-positive-fixnum))))))
#|
(defun ast-diff-wrap-sequence (ast-a sub-a ast-b)
"Search for the best wrap of the children SUB-A of AST-A inside
AST-B. If none are acceptable, return :BAD"
;; Search inside SUB-A for sequences of children of a node of Sort (AST-CLASS AST-A)
;; that meet some criterion
(let* ((len (length sub-a))
(best-candidate nil)
(best-cost most-positive-fixnum))
(assert (>= len 2))
(nest
(let ((a-class (ast-class ast-a)
(limit (- (reduce #'+ sub-a :key #'ast-cost) *max-wrap-diff*))
(*wrap-sequence* nil)
(sub-ast-a (copy ast-a :children (coerce sub-a 'list)))))
(map-ast-while/path ast-b)
(lambda (x path) #+ast-diff-debug (format t "Path = ~A~%" path))
;; Abort descent if the subtree is too small
(if (and (> limit 0) (< (ast-cost x) limit)) nil)
(if (not (eql (ast-class x) a-class)) t)
(multiple-value-bind (diff raw-cost) (ast-diff* sub-ast-a b))
(let ((cost (+ cost prefix-cost postfix-cost))))
(progn
(when (< cost best-cost)
(setf best-cost cost)
(multiple-value-bind (left-wrap rightwrap classes)
(wraps-of-path ast-b (reverse path))
(setf best-candidate
`(:wrap-sequence ,len ,diff ,path ,left-wrap
,right-wrap ,classes ,ast-a))))
t))
;; At this point, we've found the best candidate. Recompute the cost
(values best-candidate (diff-cost best-candidate))))
|#
(defun remove-common-prefix-and-suffix (list-a list-b)
"Return unique portions of LIST-A and LIST-B less shared prefix and postfix.
Prefix and postfix returned as additional values."
;; Just return the input lists immediately if not proper lists.
(unless (and (consp list-a) (consp list-b)
(proper-list-p list-a) (proper-list-p list-b))
(return-from remove-common-prefix-and-suffix
(values list-a list-b nil nil)))
(labels ((test (a b)
(cond ((and (typep a 'ast) (typep b 'ast))
(equal? a b))
((and (not (typep a 'ast)) (not (typep b 'ast)))
(equalp a b))
(t nil)))
(prefix (list-a list-b)
(gcp (list list-a list-b) :test #'test))
(postfix (list-a list-b)
(gcs (list list-a list-b) :test #'test)))
(let* ((prefix (prefix list-a list-b))
(pre-length (length prefix))
(a (drop pre-length list-a))
(b (drop pre-length list-b)))
;; If either list is completely consumed by the prefix, return here.
(if (or (null a) (null b))
(values a b prefix nil)
;; Calculate the postfix (less the prefix) if necessary.
(let* ((postfix (postfix a b))
(post-length (length postfix)))
(values (butlast a post-length)
(butlast b post-length)
prefix
postfix))))))
(defun make-cache (total-a total-b)
(make-array (list (1+ (clength total-a)) (1+ (clength total-b)))
:initial-element nil))
(defstruct rd-node
"Node in the recursive-diff computation graph"
;; Coordinates of the node in the r-d graph
(a 0 :type array-index :read-only t)
(b 0 :type array-index :read-only t)
(in-arcs nil :type list) ;; list of arcs into this node
(out-arcs nil :type list) ;; list of arcs out of this node
(open-pred-count 0 :type (integer 0)) ;; number of predecessors that are still open
(best-in-arc nil) ;; The in arc that gave the lowest cost to this point
(cost 0 :type (integer 0)) ;; total cost to reach this node along best path
)
(defmethod print-object ((node rd-node) stream)
(if *print-readably*
(call-next-method)
(print-unreadable-object (node stream :type t :identity t)
(format stream ":A ~a :B ~a"
(rd-node-a node) (rd-node-b node)))))