From c8d55e1a452448fa3a4379fd73cb1fc06284b1ea Mon Sep 17 00:00:00 2001
From: James Kukucka <jkukucka@gmu.edu>
Date: Mon, 24 Jan 2022 10:39:43 -0500
Subject: [PATCH] Updating de-dup script and instructions

---
 README.md         |  3 ++-
 scripts/unique.py | 49 +++++++++++++++++++++++++++++++----------------
 2 files changed, 35 insertions(+), 17 deletions(-)

diff --git a/README.md b/README.md
index 6532387..f569cae 100644
--- a/README.md
+++ b/README.md
@@ -111,9 +111,10 @@ However, some manual analysis is still needed, as a shortcoming of a stack analy
 
 Before running the de-duplication script, ensure that you have Python 3 installed on your machine. 
 You may access the tarball of failures from the CONFETTI experiments by downloading them from the following URL: **TODO URL**.
+First, extract the tarball into a directory.
 Afterwards, you may perform the de-duplication by running `scripts/unique.py` as follows
 
-`python3 scripts/unique.py /path/to/failures.tgz`
+`python3 scripts/unique.py /path/to/failures/directory`
 
 This will create a directory within the `scripts/` directory called `bugs`. 
 The failures within the tarball will be de-duplicated and the `bugs` directory will create a directory hierarchy corresponding to the target+fuzzer, the bug class, and the trials which found that bug. 
diff --git a/scripts/unique.py b/scripts/unique.py
index 950eb71..2f94f04 100755
--- a/scripts/unique.py
+++ b/scripts/unique.py
@@ -1,6 +1,6 @@
 #!/usr/bin/python3
 
-# Command line arguments:  tar.gz files
+# Command line arguments:  Directory containing CONFETTI result tarballs
 
 import sys
 import tarfile
@@ -18,7 +18,7 @@
             "bcelgen-knarr-z3",
             "bcelgen-jqf",
             "bcelgen-knarr-z3-no-global-hint",
-            "closure-knarr-z3",
+            "closure-knarr-z3",
             "closure-jqf",
             "closure-knarr-z3-no-global-hint",
             "maven-knarr-z3",
@@ -36,11 +36,17 @@
 
 shutil.rmtree(outputdir,True)
 
-for fname in sys.argv[1:]:
+# Look for trials 1 through 20 of every project, named <project>-<trial>.tgz
+fnames = []
+for project in projects:
+    for i in range(1,21):
+        fnames.append(os.path.join(sys.argv[1], "%s-%d.tgz" % (project, i)))
+
+bugs = {}  # project -> {md5 of fail trace -> [trace member names]}, shared by all tarballs
+for fname in fnames:
     print(fname)
     with tarfile.open(fname) as tgz:
         for project in projects:
-            bugs = {}
             for tgzfile in tgz.getmembers():
                 # Is this file in the archive a fail?
                 if not failregex.match(tgzfile.name):
@@ -48,8 +54,14 @@
                 if not project in tgzfile.name:
                     continue
                 elif project in tgzfile.name:
-                    if "-no-global-hint" in tgzfile.name and "no-global-hint" not in project:
+                    if "-no-global-hint" in tgzfile.name and "-no-global-hint" not in project:
                         continue
+                    if "-no-global-hint" in project and "-no-global-hint" not in tgzfile.name:
+                        continue
+
+                if project not in bugs:
+                    bugs[project] = {}
+
                 #print(tgzfile.name)
                 with tgz.extractfile(tgzfile.name) as f:
                     # hash the contents of the fail trace
@@ -68,12 +80,13 @@
                     cwd = os.path.join(outputdir,project,md5,"")
 
                     # use hash to reason about the uniqueness of the fail
-                    if md5 in bugs:
-                        b = bugs[md5]
+                    if md5 in bugs[project]:
+                        b = bugs[project][md5]
                     else:
                         b = []
-                        bugs[md5] = b
-                        os.makedirs(cwd)
+                        bugs[project][md5] = b
+                        # exist_ok: do not fail if the bug directory is already present
+                        os.makedirs(cwd, exist_ok=True)
 
                     # extract stacktrace to the correct dir
                     tgz.extract(tgzfile, cwd)
@@ -81,13 +94,17 @@
                     tgz.extract(re.sub(r'\.trace$', '.input', tgzfile.name), cwd)
                     # register failure in our index
                     b.append(tgzfile.name)
+            
 
-            print("Found %d unique bugs for project %s" % (len(bugs), project))
 
-            i=0
-            for b,fs in bugs.items():
-                i += 1
-                cwd = os.path.join(outputdir,project,b,"final-failures")
-                #print(cwd)
+# Print out information about bugs for each project
+for project in projects:
+    # A project with no recorded failures has no entry in the index
+    project_bugs = bugs.get(project, {})
+    print("Found %d unique bugs for project %s" % (len(project_bugs), project))
+    for b in project_bugs:
+        # each subdirectory of the bug's directory is counted as one finding
+        cwd = os.path.join(outputdir,project,b)
 
-                print("Bug {} was found {} times".format(b, len(next(os.walk(cwd))[1])))
+        print("Bug {} was found {} times".format(b, len(next(os.walk(cwd))[1])))
+    print("\n\n")
\ No newline at end of file