Image Deduplicator (imagededup)
imagededup is a python package that simplifies the task of finding exact and near duplicates in an image collection.
This package provides functionality to make use of hashing algorithms that are particularly good at finding exact duplicates as well as convolutional neural networks which are also adept at finding near duplicates. An evaluation framework is also provided to judge the quality of deduplication for a given dataset.
The following details the functionality provided by the package:
Finding duplicates in a directory using one of the following algorithms:
Convolutional Neural Network (CNN) - Select from several prepackaged models or provide your own custom model.
Perceptual hashing (PHash)
Difference hashing (DHash)
Wavelet hashing (WHash)
Average hashing (AHash)
Generation of encodings for images using one of the above stated algorithms.
Framework to evaluate effectiveness of deduplication given a ground truth mapping.
Plotting duplicates found for a given image file.
Detailed documentation for the package can be found at: https://idealo.github.io/imagededup/
imagededup is compatible with Python 3.8+ and runs on Linux, MacOS X and Windows. It is distributed under the Apache 2.0 license.
Plot duplicates obtained for a given file (e.g., 'ukbench00120.jpg') using the duplicates dictionary
from imagededup.utils import plot_duplicates
plot_duplicates(image_dir='path/to/image/directory',
                duplicate_map=duplicates,
                filename='ukbench00120.jpg')
The output looks as below:
The complete code for the workflow is:
from imagededup.methods import PHash
phasher = PHash()

# Generate encodings for all images in an image directory
encodings = phasher.encode_images(image_dir='path/to/image/directory')

# Find duplicates using the generated encodings
duplicates = phasher.find_duplicates(encoding_map=encodings)

# Plot duplicates obtained for a given file using the duplicates dictionary
from imagededup.utils import plot_duplicates
plot_duplicates(image_dir='path/to/image/directory',
                duplicate_map=duplicates,
                filename='ukbench00120.jpg')
It is also possible to use your own custom models for finding duplicates using the CNN method.
For examples, refer to this part of the repository.
For more detailed usage of the package functionality, refer to: https://idealo.github.io/imagededup/
Update: The provided benchmarks are only valid up to imagededup v0.2.2. Later releases introduce significant changes to all methods, so the current benchmarks may not hold.
Detailed benchmarks on speed and classification metrics for different methods have been provided in the documentation. Generally speaking, the following conclusions can be drawn:
CNN works best for near duplicates and datasets containing transformations.
All deduplication methods fare well on datasets containing exact duplicates, but Difference hashing is the fastest.
Please cite Imagededup in your publications if this is useful for your research. Here is an example BibTeX entry:
@misc{idealods2019imagededup,
  title={Imagededup},
  author={Tanuj Jain and Christopher Lennan and Zubin John and Dat Tran},
  year={2019},
  howpublished={\url{https://github.com/idealo/imagededup}},
}
Feedback should be submitted by creating an issue on GitHub. Select the related template (bug report, feature request, or custom) and add the corresponding labels.
Copyright 2019 idealo internet GmbH. All rights reserved.
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
Definitions.
"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.
"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work.
Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.
Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed.
Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:
(a) You must give any other recipients of the Work or Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License.
You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License.
Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions.
Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file.
Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License.
Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
Given a ground truth map and a duplicate map retrieved from a deduplication algorithm, get metrics to evaluate the effectiveness of the applied deduplication algorithm.
ground_truth_map: A dictionary representing ground truth with filenames as key and a list of duplicate filenames as value.
retrieved_map: A dictionary representing retrieved duplicates with filenames as key and a list of retrieved duplicate filenames as value.
metric: Name of the metric to be evaluated and returned. Accepted values are: 'map', 'ndcg', 'jaccard', 'classification', 'all' (default, returns every metric).
dictionary: A dictionary with metric names as keys and the corresponding calculated metrics as values. 'map', 'ndcg' and 'jaccard' each return a single number denoting the corresponding information retrieval metric. The 'classification' metrics 'precision', 'recall' and 'f1-score' are returned as individual entries in the dictionary. The value of each classification metric is a numpy array whose first entry is the score for non-duplicate file pairs (class 0) and whose second entry is the score for duplicate file pairs (class 1). Additionally, 'support' is returned as another key, with the first entry denoting the number of non-duplicate file pairs and the second entry the number of duplicate file pairs.

CIFAR10 deduplication example

Install imagededup via PyPI
!pip install imagededup

Download CIFAR10 dataset and untar

Create working directory and move all images into this directory

Find duplicates in the entire dataset with CNN
from imagededup.methods import CNN

cnn = CNN()
encodings = cnn.encode_images(image_dir=image_dir)
duplicates = cnn.find_duplicates(encoding_map=encodings)

Do some imports for plotting
from pathlib import Path
from imagededup.utils import plot_duplicates
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (15, 10)

Find and plot duplicates in the test set with CNN
# test images are stored under '/content/cifar/test'
filenames_test = set([i.name for i in Path('/content/cifar/test').glob('*.png')])

duplicates_test = {}
for k, v in duplicates.items():
    if k in filenames_test:
        tmp = [i for i in v if i in filenames_test]
        duplicates_test[k] = tmp

# sort in descending order of duplicates
duplicates_test = {k: v for k, v in sorted(duplicates_test.items(), key=lambda x: len(x[1]), reverse=True)}

# plot duplicates found for some file
plot_duplicates(image_dir=image_dir, duplicate_map=duplicates_test, filename=list(duplicates_test.keys())[0])

Find and plot duplicates in the train set with CNN
# train images are stored under '/content/cifar/train'
filenames_train = set([i.name for i in Path('/content/cifar/train').glob('*.png')])

duplicates_train = {}
for k, v in duplicates.items():
    if k in filenames_train:
        tmp = [i for i in v if i in filenames_train]
        duplicates_train[k] = tmp

# sort in descending order of duplicates
duplicates_train = {k: v for k, v in sorted(duplicates_train.items(), key=lambda x: len(x[1]), reverse=True)}

# plot duplicates found for some file
plot_duplicates(image_dir=image_dir, duplicate_map=duplicates_train, filename=list(duplicates_train.keys())[0])

Examples from test set with duplicates in train set
# keep only filenames that are in the test set and have duplicates in the train set
duplicates_test_train = {}
for k, v in duplicates.items():
    if k in filenames_test:
        tmp = [i for i in v if i in filenames_train]
        duplicates_test_train[k] = tmp

# sort in descending order of duplicates
duplicates_test_train = {k: v for k, v in sorted(duplicates_test_train.items(), key=lambda x: len(x[1]), reverse=True)}

# plot duplicates found for some file
plot_duplicates(image_dir=image_dir, duplicate_map=duplicates_test_train, filename=list(duplicates_test_train.keys())[0])
Given ground truth dictionary and retrieved dictionary, return per class precision, recall and f1 score. Class 1 is assigned to duplicate file pairs while class 0 is for non-duplicate file pairs.
Initialize a dictionary for mapping file names and corresponding hashes and a distance function to be used for getting distance between two hash strings.
List of tuples of the form [(valid_retrieval_filename1, distance), (valid_retrieval_filename2, distance)]

Brute force cython

class BruteForceCython
Initialize a dictionary for mapping file names and corresponding hashes and a distance function to be used for getting distance between two hash strings.
Initialize a HashEval object which offers an interface to control hashing and search methods for desired dataset. Compute a map of duplicate images in the document space given certain input control parameters.
Find duplicates using CNN and/or generate CNN encodings given a single image or a directory of images.
The module can be used for 2 purposes: Encoding generation and duplicate detection.
Encodings generation: To propagate an image through a Convolutional Neural Network architecture and generate encodings. The generated encodings can be used at a later time for deduplication. Using the method 'encode_image', the CNN encodings for a single image can be obtained while the 'encode_images' method can be used to get encodings for all images in a directory.
Duplicate detection: Find duplicates either using the encoding mapping generated previously using 'encode_images' or using a Path to the directory that contains the images that need to be deduplicated. 'find_duplicates' and 'find_duplicates_to_remove' methods are provided to accomplish these tasks.
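A minimal sketch of this two-step workflow (the directory path is a placeholder):

from imagededup.methods import CNN

cnn = CNN()

# Encoding generation for all images in a directory
encodings = cnn.encode_images(image_dir='path/to/image/directory')

# Duplicate detection using the generated encodings
duplicates = cnn.find_duplicates(encoding_map=encodings, min_similarity_threshold=0.9)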
recursive: Optional, find images recursively in a nested image directory structure, set to False by default.
num_enc_workers: Optional, number of cpu cores to use for multiprocessing encoding generation (supported only on linux platform), set to 0 by default. 0 disables multiprocessing.
Find duplicates for each file. Takes in a path to the directory or an encoding dictionary in which duplicates are to be detected above the given threshold. Returns a dictionary with each filename as key and a list of duplicate file names as value. Optionally, the cosine similarity scores can be returned along with the duplicate filenames for each query file.
image_dir: Path to the directory containing all the images or dictionary with keys as file names and values as numpy arrays which represent the CNN encoding for the key image file.
encoding_map: Optional, used instead of image_dir, a dictionary containing mapping of filenames and corresponding CNN encodings.
min_similarity_threshold: Optional, threshold value (must be float between -1.0 and 1.0). Default is 0.9
scores: Optional, boolean indicating whether similarity scores are to be returned along with retrieved duplicates.
outfile: Optional, name of the file to save the results, must be a json. Default is None.
recursive: Optional, find images recursively in a nested image directory structure, set to False by default.
num_enc_workers: Optional, number of cpu cores to use for multiprocessing encoding generation (supported only on linux platform), set to 0 by default. 0 disables multiprocessing.
num_sim_workers: Optional, number of cpu cores to use for multiprocessing similarity computation, set to number of CPUs in the system by default. 0 disables multiprocessing.
dictionary: if scores is True, then a dictionary of the form {'image1.jpg': [('image1_duplicate1.jpg', score), ('image1_duplicate2.jpg', score)], 'image2.jpg': [] ..}. if scores is False, then a dictionary of the form {'image1.jpg': ['image1_duplicate1.jpg', 'image1_duplicate2.jpg'], 'image2.jpg':['image1_duplicate1.jpg',..], ..}
image_dir: Path to the directory containing all the images or dictionary with keys as file names and values as numpy arrays which represent the CNN encoding for the key image file.
encoding_map: Optional, used instead of image_dir, a dictionary containing mapping of filenames and corresponding CNN encodings.
min_similarity_threshold: Optional, threshold value (must be float between -1.0 and 1.0). Default is 0.9
outfile: Optional, name of the file to save the results, must be a json. Default is None.
recursive: Optional, find images recursively in a nested image directory structure, set to False by default.
num_enc_workers: Optional, number of cpu cores to use for multiprocessing encoding generation (supported only on linux platform), set to 0 by default. 0 disables multiprocessing.
num_sim_workers: Optional, number of cpu cores to use for multiprocessing similarity computation, set to number of CPUs in the system by default. 0 disables multiprocessing.
Find duplicates using hashing algorithms and/or generate hashes given a single image or a directory of images.
The module can be used for 2 purposes: Encoding generation and duplicate detection.
Encoding generation: To generate hashes using specific hashing method. The generated hashes can be used at a later time for deduplication. Using the method 'encode_image' from the specific hashing method object, the hash for a single image can be obtained while the 'encode_images' method can be used to get hashes for all images in a directory.
Duplicate detection: Find duplicates either using the encoding mapping generated previously using 'encode_images' or using a Path to the directory that contains the images that need to be deduplicated. 'find_duplicates' and 'find_duplicates_to_remove' methods are provided to accomplish these tasks.
Calculate the hamming distance between two hashes. If the length of the hashes is not 64 bits, each hash is padded to 64 bits before the hamming distance is calculated.
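For intuition, a standalone sketch of a hamming distance computation between two 64-bit hex hash strings (illustrative only, not the package's internal implementation):

def hamming_distance(hash1: str, hash2: str) -> int:
    # Convert each hexadecimal hash string to a 64-bit binary string and count differing bits
    b1 = bin(int(hash1, 16))[2:].zfill(64)
    b2 = bin(int(hash2, 16))[2:].zfill(64)
    return sum(c1 != c2 for c1, c2 in zip(b1, b2))

print(hamming_distance('9fee256239984d71', '9fee256239984d70'))  # -> 1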
recursive: Optional, find images recursively in a nested image directory structure, set to False by default.
num_enc_workers: Optional, number of cpu cores to use for multiprocessing encoding generation, set to number of CPUs in the system by default. 0 disables multiprocessing.
dictionary: A dictionary that contains a mapping of filenames and corresponding 64 character hash string such as {'Image1.jpg': 'hash_string1', 'Image2.jpg': 'hash_string2', ...}
Find duplicates for each file. Takes in a path to the directory or an encoding dictionary in which duplicates are to be detected. All images with a hamming distance less than or equal to the max_distance_threshold are regarded as duplicates. Returns a dictionary with each filename as key and a list of duplicate file names as value. Optionally, the hamming distances can be returned along with the duplicate filenames for each query file.
image_dir: Path to the directory containing all the images or dictionary with keys as file names and values as hash strings for the key image file.
encoding_map: Optional, used instead of image_dir, a dictionary containing mapping of filenames and corresponding hashes.
max_distance_threshold: Optional, hamming distance between two images below which retrieved duplicates are valid. (must be an int between 0 and 64). Default is 10.
scores: Optional, boolean indicating whether Hamming distances are to be returned along with retrieved duplicates.
outfile: Optional, name of the file to save the results, must be a json. Default is None.
search_method: Algorithm used to retrieve duplicates. Default is brute_force_cython for Unix else bktree.
recursive: Optional, find images recursively in a nested image directory structure, set to False by default.
num_enc_workers: Optional, number of cpu cores to use for multiprocessing encoding generation, set to number of CPUs in the system by default. 0 disables multiprocessing.
num_dist_workers: Optional, number of cpu cores to use for multiprocessing distance computation, set to number of CPUs in the system by default. 0 disables multiprocessing.
duplicates dictionary: if scores is True, then a dictionary of the form {'image1.jpg': [('image1_duplicate1.jpg', score), ('image1_duplicate2.jpg', score)], 'image2.jpg': [] ..}. if scores is False, then a dictionary of the form {'image1.jpg': ['image1_duplicate1.jpg', 'image1_duplicate2.jpg'], 'image2.jpg':['image1_duplicate1.jpg',..], ..}
image_dir: Path to the directory containing all the images or dictionary with keys as file names and values as hash strings for the key image file.
encoding_map: Optional, used instead of image_dir, a dictionary containing mapping of filenames and corresponding hashes.
max_distance_threshold: Optional, hamming distance between two images below which retrieved duplicates are valid. (must be an int between 0 and 64). Default is 10.
outfile: Optional, name of the file to save the results, must be a json. Default is None.
recursive: Optional, find images recursively in a nested image directory structure, set to False by default.
num_enc_workers: Optional, number of cpu cores to use for multiprocessing encoding generation, set to number of CPUs in the system by default. 0 disables multiprocessing.
num_dist_workers: Optional, number of cpu cores to use for multiprocessing distance computation, set to number of CPUs in the system by default. 0 disables multiprocessing.
Inherits from Hashing base class and implements perceptual hashing (Implementation reference: http://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html).
Offers all the functionality mentioned in hashing class.
Inherits from Hashing base class and implements average hashing. (Implementation reference: http://www.hackerfactor.com/blog/index.php?/archives/529-Kind-of-Like-That.html)
Offers all the functionality mentioned in hashing class.
Inherits from Hashing base class and implements difference hashing. (Implementation reference: http://www.hackerfactor.com/blog/index.php?/archives/529-Kind-of-Like-That.html)
Offers all the functionality mentioned in hashing class.
Inherits from Hashing base class and implements wavelet hashing. (Implementation reference: https://fullstackml.com/wavelet-image-hash-in-python-3504fdd282b5)
Offers all the functionality mentioned in hashing class.
To give an idea of the speed and accuracy of the implemented algorithms, benchmarks have been provided on the UKBench dataset (zip file titled 'UKBench image collection', size ~1.5 GB) and some variations derived from it.
Near duplicate dataset (UKBench dataset): This dataset has near duplicates that are arranged in groups of 4. There are 2550 such groups, amounting to a total of 10200 RGB images. The size of each image is 640 x 480 pixels with a jpg extension. The image below depicts 3 example groups from the UKBench dataset. Each row represents a group with the corresponding 4 images from the group.
Transformed dataset derived from UKBench dataset: One image was taken from each of several different groups of the UKBench dataset, and the following 5 transformations were applied to the original image:
Random crop preserving the original aspect ratio (new size - 560 x 420)
Horizontal flip
Vertical flip
25 degree rotation
Resizing with change in aspect ratio (new aspect ratio - 1:1)
Thus, each group has a total of 6 images (original + transformed). A total of 1800 such groups were created totalling 10800 images in the dataset.
Exact duplicate dataset: An image from each of the 2550 image groups of the UKBench dataset was taken and an exact duplicate was created. The number of images totalled 5100.
The benchmarks were performed on an AWS EC2 r5.xlarge instance with 4 vCPUs and 32 GB of memory. The instance does not have a GPU, so all runs were done on CPU.
The times are reported in seconds and comprise the time taken to generate encodings and find duplicates. The time taken to perform the evaluation task is NOT reported.
The cnn method with a threshold between 0.5 and 0.9 would work best for finding near duplicates. This is indicated by the extreme values that class-1 precision and recall take at these two thresholds.
Hashing methods do not perform well for finding near duplicates.
The cnn method with threshold 0.9 seems to work best for finding transformed duplicates. A slightly lower min_similarity_threshold value could lead to a higher class-1 recall.
Hashing methods do not perform well for finding transformed duplicates. Resized images are found easily, but all other transformations lead to poor performance for hashing methods.
* The value is low as opposed to the expected 1.0 because of the cosine_similarity function from scikit-learn (used within the package), which sometimes calculates the similarity to be slightly less than 1.0 even when the vectors are the same.
Difference hashing is the fastest (max_distance_threshold 0).
When using hashing methods for exact duplicates, keep max_distance_threshold to a low value. The value of 0 is good, but a slightly higher value should also work fine.
When using cnn method, keep min_similarity_threshold to a high value. The default value of 0.9 seems to work well. A slightly higher value can also be used.
Near duplicate dataset: use cnn with an appropriate min_similarity_threshold.
Transformed dataset: use cnn with min_similarity_threshold of around 0.9 (default).
Exact duplicates dataset: use Difference hashing with 0 max_distance_threshold.
A higher max_distance_threshold (for hashing methods) leads to a higher execution time. The cnn method does not seem to be much affected by the min_similarity_threshold (though a lower value adds a few seconds to the execution time, as can be seen in the runs above).
Generally speaking, the cnn method takes longer to run compared to hashing methods for all datasets. If a GPU is available, the cnn method should be much faster.

Using custom models for CNN
To allow users to use custom models for encoding generation, we provide a CustomModel construct which serves as a wrapper for a user-defined feature extractor. The CustomModel consists of the following attributes:
name: The name of the custom model. Can be set to any string.
model: A PyTorch model object, which is a subclass of torch.nn.Module and implements the forward method. The output of the forward method should be a tensor of shape (batch_size x features). Alternatively, a __call__ method is also accepted.
transform: A function that transforms a PIL.Image object into a PyTorch tensor. Should correspond to the preprocessing logic of the supplied model.
CustomModel is provided while initializing the cnn object and can be used in the following 2 scenarios:
1. Using the models provided with the imagededup package. There are 3 models provided currently:
MobileNetV3 (MobileNetV3 Small)- This is the default.
ViT (Vision Transformer- B16 IMAGENET1K_SWAG_E2E_V1)
EfficientNet (EfficientNet B4- IMAGENET1K_V1)
from imagededup.methods import CNN

# Get CustomModel construct
from imagededup.utils import CustomModel

# Get the prepackaged models from imagededup
from imagededup.utils.models import ViT, MobilenetV3, EfficientNet


# Declare a custom config with CustomModel, the prepackaged models come with a name and transform function
custom_config = CustomModel(name=EfficientNet.name,
                            model=EfficientNet(),
                            transform=EfficientNet.transform)

# Use model_config argument to pass the custom config
cnn = CNN(model_config=custom_config)

# Use the model as usual
...
2. Using a user-defined custom model.
from imagededup.methods import CNN

# Get CustomModel construct
from imagededup.utils import CustomModel

# Import necessary pytorch constructs for initializing a custom feature extractor
import torch
from torchvision.transforms import transforms

# Declare custom feature extractor class
class MyModel(torch.nn.Module):
    transform = transforms.Compose(
        [
            transforms.ToTensor()
        ]
    )
    name = 'my_custom_model'

    def __init__(self):
        super().__init__()
        # Define the layers of the model here

    def forward(self, x):
        # Do something with x
        return x

custom_config = CustomModel(name=MyModel.name,
                            model=MyModel(),
                            transform=MyModel.transform)

cnn = CNN(model_config=custom_config)

# Use the model as usual
...
It is not necessary to bundle name and transform functions with the model class. They can be passed separately as well.
Examples for both scenarios can be found in the examples section.
It might be desirable to only generate the hashes/cnn encodings for a given image or all images in a directory instead of directly deduplicating using find_duplicates method. Encodings can be generated for a directory of images or for a single image:
Encoding generation for all images in a directory
Encoding generation for a single image

Encoding generation for all images in a directory
To generate encodings for all images in an image directory, the encode_images function can be used. The general API for using encode_images is:
from imagededup.methods import <method-name>
method_object = <method-name>()
encodings = method_object.encode_images(image_dir='path/to/image/directory')
where the returned variable encodings is a dictionary mapping image file names to the corresponding encodings:
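For example (file names and encodings are placeholders):

{
    'image1.jpg': <encoding-of-image1>,
    'image2.jpg': <encoding-of-image2>,
    ..
}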
If an image in the image directory can't be loaded, no encodings are generated for the image. Hence, there is no entry for the image in the returned encodings dictionary.
Encoding generation for a single image

To generate the encoding for a single image, the encode_image function can be used, for example with difference hashing:

from imagededup.methods import DHash
dhasher = DHash()
encoding = dhasher.encode_image(image_file='path/to/image/file')

Evaluation of deduplication quality
To determine the quality of a deduplication algorithm and the corresponding threshold, an evaluation framework is provided.
Given a ground truth mapping consisting of file names and a list of duplicates for each file along with a retrieved mapping from the deduplication algorithm for the same files, the following metrics can be obtained using the framework:
Mean Average Precision (MAP)
Mean Normalized Discounted Cumulative Gain (NDCG)
Jaccard Index
Per class Precision (class 0 = non-duplicate image pairs, class 1 = duplicate image pairs)
Per class Recall (class 0 = non-duplicate image pairs, class 1 = duplicate image pairs)
Per class f1-score (class 0 = non-duplicate image pairs, class 1 = duplicate image pairs)
The API for obtaining these metrics is as below:
from imagededup.evaluation import evaluate
metrics = evaluate(ground_truth_map, retrieved_map, metric='<metric-name>')
where the returned variable metrics is a dictionary containing the following content:
{
    'map': <map>,
    'ndcg': <mean ndcg>,
    'jaccard': <mean jaccard index>,
    'precision': <numpy array having per class precision>,
    'recall': <numpy array having per class recall>,
    'f1-score': <numpy array having per class f1-score>,
    'support': <numpy array having per class support>
}
Presently, the ground truth map should be prepared manually by the user. Symmetric relations between duplicates must be represented in the ground truth map. If an image i is a duplicate of image j, then j must also be represented as a duplicate of i. Absence of symmetric relations will lead to an exception.
Both the ground_truth_map and retrieved_map must have the same keys.
There is a difference between the way information retrieval metrics (map, ndcg, jaccard index) and classification metrics (precision, recall, f1-score) treat the symmetric relationships in duplicates. Consider the following ground_truth_map and retrieved_map:
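For illustration, assume the following maps (hypothetical file names), where '1.jpg' and '4.jpg' are duplicates in the ground truth but this relationship is missed in the retrieved map:

ground_truth_map = {
    '1.jpg': ['2.jpg', '4.jpg'],
    '2.jpg': ['1.jpg'],
    '3.jpg': [],
    '4.jpg': ['1.jpg']
}

retrieved_map = {
    '1.jpg': ['2.jpg'],
    '2.jpg': ['1.jpg'],
    '3.jpg': [],
    '4.jpg': []
}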
From the above, it can be seen that images '1.jpg' and '4.jpg' are not found to be duplicates of each other by the deduplication algorithm.
For calculating information retrieval metrics, each key in the maps is considered as an independent 'query'. In the ground truth, '4.jpg' is a duplicate of the key '1.jpg'. When it is not retrieved, it is considered a miss for query '1.jpg'. Similarly, '1.jpg' is a duplicate of the key '4.jpg' in the ground truth. When this is not retrieved, it is considered a miss for query '4.jpg'. Thus, the missing relationship is accounted for twice instead of just once.
Classification metrics, on the other hand, consider the relationships only once by forming unique pairs of images and labelling each pair as a 0 (non-duplicate image pair) and 1 (duplicate image pair).
Using the ground_truth_map, the ground truth pairs with the corresponding labels are:
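Continuing with the illustrative ground_truth_map above, the unique pairs and labels would be:

('1.jpg', '2.jpg'): 1
('1.jpg', '3.jpg'): 0
('1.jpg', '4.jpg'): 1
('2.jpg', '3.jpg'): 0
('2.jpg', '4.jpg'): 0
('3.jpg', '4.jpg'): 0

Similarly, using the retrieved_map, the retrieved pairs with the corresponding labels would be:

('1.jpg', '2.jpg'): 1
('1.jpg', '3.jpg'): 0
('1.jpg', '4.jpg'): 0
('2.jpg', '3.jpg'): 0
('2.jpg', '4.jpg'): 0
('3.jpg', '4.jpg'): 0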
These two sets of pairs are then used to calculate metrics such as precision/recall/f1-score. It can be seen that the missing relationship between the pair ('1.jpg', '4.jpg') is accounted for only once.
Each key in the duplicates dictionary corresponds to a file in the image directory passed to the image_dir parameter of the find_duplicates function. The value is a list of all file names in the image directory that were found to be duplicates for the key file. The 'method-name' corresponds to one of the available deduplication methods and can be set to PHash, AHash, DHash, WHash or CNN.
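A minimal sketch of the general find_duplicates call (the method name and directory path are placeholders):

from imagededup.methods import <method-name>
method_object = <method-name>()
duplicates = method_object.find_duplicates(image_dir='path/to/image/directory')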
image_dir: Optional, directory where all image files are present.
encoding_map: Optional, used instead of image_dir attribute. Set it equal to the dictionary of file names and corresponding encodings (hashes/cnn encodings). The mentioned dictionary can be generated using the corresponding encode_images method.
scores: Setting it to True returns the scores representing the hamming distance (for hashing) or cosine similarity (for cnn) of each of the duplicate file names from the key file. In this case, the returned 'duplicates' dictionary has the following content:
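For example (file names and scores are placeholders):

{
    'image1.jpg': [('image1_duplicate1.jpg', score), ('image1_duplicate2.jpg', score)],
    'image2.jpg': [..]
}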
Each key in the duplicates dictionary corresponds to a file in the image directory passed to the image_dir parameter of the find_duplicates function. The value is a list of tuples representing the file names and corresponding scores in the image directory that were found to be duplicates of the key file.
outfile: Name of file to which the returned duplicates dictionary is to be written, must be a json. None by default.
threshold parameter:
min_similarity_threshold for cnn method indicating the minimum amount of cosine similarity that should exist between the key image and a candidate image so that the candidate image can be considered as a duplicate of the key image. Should be a float between -1.0 and 1.0. Default value is 0.9.
max_distance_threshold for hashing methods indicating the maximum amount of hamming distance that can exist between the key image and a candidate image so that the candidate image can be considered as a duplicate of the key image. Should be an int between 0 and 64. Default value is 10.
recursive: finding images recursively in a nested directory structure, set to False by default.
The returned duplicates dictionary contains symmetric relationships i.e., if an image i is a duplicate of image j, then image j must also be a duplicate of image i. Let's say that the image directory only consists of images i and j, then the duplicates dictionary would have the following content:
{
    'i': ['j'],
    'j': ['i']
}
If an image in the image directory can't be loaded, no encodings are generated for the image. Hence, the image is disregarded for deduplication and has no entry in the returned duplicates dictionary.
To deduplicate an image directory using perceptual hashing, with a maximum allowed hamming distance of 12, scores returned along with duplicate filenames and the returned dictionary saved to file 'my_duplicates.json', use the following:
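A sketch of this call (the directory path is a placeholder):

from imagededup.methods import PHash
phasher = PHash()
duplicates = phasher.find_duplicates(image_dir='path/to/image/directory',
                                     max_distance_threshold=12,
                                     scores=True,
                                     outfile='my_duplicates.json')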
To deduplicate an image directory using cnn, with a minimum cosine similarity of 0.85, no scores returned and the returned dictionary saved to file 'my_duplicates.json', use the following:
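A corresponding sketch for the cnn method, assuming the CNN class in imagededup.methods:
from imagededup.methods import CNN

# cnn method with a minimum cosine similarity of 0.85, no scores returned
cnn_encoder = CNN()
duplicates = cnn_encoder.find_duplicates(image_dir='path/to/image/directory',
                                         min_similarity_threshold=0.85,
                                         scores=False,
                                         outfile='my_duplicates.json')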
Returns a list of files in the image directory that are considered as duplicates. Does NOT remove the said files.
The api is similar to the find_duplicates function, except that find_duplicates_to_remove does not accept the scores attribute. This function returns a single list of file names in the directory that are found to be duplicates. The general api for the method is sketched below:
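A minimal sketch of the call, shown here with the PHash class (any of the other methods can be substituted; the class name and file paths are illustrative):
from imagededup.methods import PHash

phasher = PHash()
duplicates = phasher.find_duplicates_to_remove(image_dir='path/to/image/directory',
                                               max_distance_threshold=10,
                                               outfile='my_duplicates.json')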
In this case, the returned variable duplicates is a list containing the name of image files that are found to be duplicates of some file in the directory:
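For example, with hypothetical file names, the returned list might look like:
['image1_duplicate1.jpg', 'image1_duplicate2.jpg', 'image2_duplicate1.jpg']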
image_dir: Optional, directory where all image files are present.
encoding_map: Optional, used instead of image_dir attribute. Set it equal to the dictionary of file names and corresponding encodings (hashes/cnn encodings). The mentioned dictionary can be generated using the corresponding encode_images method.
outfile: Name of file to which the returned duplicates dictionary is to be written, must be a json. None by default.
threshold parameter:
min_similarity_threshold for cnn method indicating the minimum amount of cosine similarity that should exist between the key image and a candidate image so that the candidate image can be considered as a duplicate for the key image. Should be a float between -1.0 and 1.0. Default value is 0.9.
max_distance_threshold for hashing methods indicating the maximum amount of hamming distance that can exist between the key image and a candidate image so that the candidate image can be considered as a duplicate for the key image. Should be an int between 0 and 64. Default value is 10.
recursive: finding images recursively in a nested directory structure, set to False by default.
This method must be used with caution. The symmetric nature of duplicates imposes an issue of marking one image as duplicate and the other as original. Consider the following duplicates dictionary:
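{
    '1.jpg': ['2.jpg'],
    '2.jpg': ['1.jpg', '3.jpg'],
    '3.jpg': ['2.jpg']
}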
In this case, it is possible to remove only 2.jpg, which leaves 1.jpg and 3.jpg as non-duplicates of each other. However, it is also possible to remove both 1.jpg and 3.jpg, leaving only 2.jpg. The find_duplicates_to_remove method can thus return either of the outputs. In the above example, let's say that 1.jpg is retained, while its duplicate, 2.jpg, is marked as a duplicate. Once 2.jpg is marked as a duplicate, its own found duplicates are disregarded. Thus, 1.jpg and 3.jpg would not be considered duplicates. So, the final return would be:
['2.jpg']
This leaves 1.jpg and 3.jpg as non-duplicates in the directory. If the user does not wish to impose this heuristic, it is advised to use find_duplicates function and use a custom heuristic to mark a file as duplicate.
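As an illustrative sketch of such a custom heuristic (not part of the package), which keeps a file unless it has already been marked for removal because of an earlier key:
def files_to_remove(duplicates):
    """Greedy heuristic over a duplicates dictionary returned by find_duplicates (scores=False)."""
    marked = set()
    for key, dups in duplicates.items():
        if key in marked:
            # This file is already considered a duplicate of a kept file,
            # so its own duplicates are not marked because of it.
            continue
        for dup in dups:
            if dup != key:
                marked.add(dup)
    return sorted(marked)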
If an image in the image directory can't be loaded, no encodings are generated for the image. Hence, the image is disregarded for deduplication and does not appear in the returned list.
To deduplicate an image directory using perceptual hashing, with a maximum allowed hamming distance of 12, and the returned list saved to file 'my_duplicates.json', use the following:
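An illustrative snippet, again assuming the PHash class:
from imagededup.methods import PHash

phasher = PHash()
duplicates = phasher.find_duplicates_to_remove(image_dir='path/to/image/directory',
                                               max_distance_threshold=12,
                                               outfile='my_duplicates.json')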
To deduplicate an image directory using cnn, with a minimum cosine similarity of 0.85 and the returned list saved to file 'my_duplicates.json', use the following:
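A corresponding sketch for the cnn method:
from imagededup.methods import CNN

cnn_encoder = CNN()
duplicates = cnn_encoder.find_duplicates_to_remove(image_dir='path/to/image/directory',
                                                   min_similarity_threshold=0.85,
                                                   outfile='my_duplicates.json')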
"},{"location":"user_guide/plotting_duplicates/","title":"Plotting duplicates of an image","text":"
Once a duplicate dictionary corresponding to an image directory has been obtained (using find_duplicates), duplicates for an image can be plotted using plot_duplicates method as below:
from imagededup.utils import plot_duplicates
plot_duplicates(image_dir, duplicate_map, filename, outfile=None)
where filename is the file for which duplicates are to be plotted.
image_dir: Directory where all image files are present.
duplicate_map: A dictionary representing retrieved duplicates with filenames as key and a list of retrieved duplicate filenames as value. A duplicate_map with scores can also be passed (obtained from find_duplicates function with scores attribute set to True).
filename: Image file name for which duplicates are to be plotted.
outfile: Optional, name of the file the plot should be saved to. None by default.
Checks the sanity of the input image numpy array for the cnn method and converts a grayscale numpy array to RGB by repeating the array three times along the third dimension if a 2-dimensional image array is provided.
Loads an image given its path. Returns an array version of the image, optionally resized and converted to grayscale. Only allows images of the types described by the img_formats argument.
from imagededup.utils import plot_duplicates
plot_duplicates(image_dir='path/to/image/directory',
                duplicate_map=duplicate_map,
                filename='path/to/image.jpg')
To allow users to use custom models for encoding generation, we provide a CustomModel construct which serves as a wrapper for a user-defined feature extractor. The CustomModel consists of the following attributes:
name: The name of the custom model. Can be set to any string.
model: A PyTorch model object, which is a subclass of torch.nn.Module and implements the forward method. The output of the forward method should be a tensor of shape (batch_size x features). Alternatively, a __call__ method is also accepted.
transform: A function that transforms a PIL.Image object into a PyTorch tensor. Should correspond to the preprocessing logic of the supplied model.
CustomModel is provided while initializing the cnn object and can be used in the following 2 scenarios:
1. Using the models provided with the imagededup package. There are 3 models provided currently: MobilenetV3, ViT and EfficientNet.
from imagededup.methods import CNN

# Get CustomModel construct
from imagededup.utils import CustomModel

# Get the prepackaged models from imagededup
from imagededup.utils.models import ViT, MobilenetV3, EfficientNet

# Declare a custom config with CustomModel, the prepackaged models come with a name and transform function
custom_config = CustomModel(name=EfficientNet.name,
                            model=EfficientNet(),
                            transform=EfficientNet.transform)

# Use model_config argument to pass the custom config
cnn = CNN(model_config=custom_config)

# Use the model as usual
...
2. Using a user-defined custom model.
from imagededup.methods import CNN

# Get CustomModel construct
from imagededup.utils import CustomModel

# Import necessary pytorch constructs for initializing a custom feature extractor
import torch
from torchvision.transforms import transforms

# Declare custom feature extractor class
class MyModel(torch.nn.Module):
    transform = transforms.Compose(
        [
            transforms.ToTensor()
        ]
    )
    name = 'my_custom_model'

    def __init__(self):
        super().__init__()
        # Define the layers of the model here

    def forward(self, x):
        # Do something with x
        return x

custom_config = CustomModel(name=MyModel.name,
                            model=MyModel(),
                            transform=MyModel.transform)

cnn = CNN(model_config=custom_config)

# Use the model as usual
...
It is not necessary to bundle name and transform functions with the model class. They can be passed separately as well.
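A brief sketch of this alternative, reusing the MyModel class from the previous snippet but supplying an illustrative name and a standalone transform function:
from torchvision.transforms import transforms
from imagededup.utils import CustomModel

# Standalone preprocessing function, not attached to the model class
my_transform = transforms.Compose([transforms.ToTensor()])

custom_config = CustomModel(name='my_standalone_model',
                            model=MyModel(),
                            transform=my_transform)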
Examples for both scenarios can be found in the examples section.
CustomModel is a named tuple that can be used to initialize a custom PyTorch model. Its arguments are:
name: The name of the custom model. Default is 'default_model'.
model: The PyTorch model object, which is a subclass of torch.nn.Module and implements the forward method. The output of the forward method should be a tensor of shape (batch_size x features). Alternatively, a __call__ method is also accepted. Default is None.
transform: A function that transforms a PIL.Image object into a PyTorch tensor that will be applied to each image before being fed to the model. Should correspond to the preprocessing logic of the supplied model. Default is None.
class MobilenetV3
__init__: def __init__()
Initializes a MobilenetV3 model, cuts it at the global average pooling layer and returns the output features.
forward: def forward(x)

class ViT
__init__: def __init__()
Initializes a ViT model, takes the mean of the final encoder layer outputs and returns those as features for a given image.
forward: def forward(x)

class EfficientNet
__init__: def __init__()
Initializes an EfficientNet model, cuts it at the global average pooling layer and returns the output features.
forward: def forward(x)