From d80710081b99fea948f92a2c65012dfe42573166 Mon Sep 17 00:00:00 2001
From: "renovate[bot]"
Date: Tue, 19 Dec 2023 09:38:24 +0000
Subject: [PATCH] Deploying to gh-pages from @ idealo/imagededup@eded4a2301b810dc497c11fc58ddf76611ef3e15 🚀
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 404.html | 529 +++++++----
 CONTRIBUTING/index.html | 600 +++++++++----
 LICENSE/index.html | 536 +++++++----
 assets/javascripts/bundle.51198bba.min.js | 29 -
 assets/javascripts/bundle.51198bba.min.js.map | 8 -
 assets/javascripts/bundle.d7c377c4.min.js | 29 +
 assets/javascripts/bundle.d7c377c4.min.js.map | 7 +
 assets/javascripts/lunr/min/lunr.el.min.js | 1 +
 assets/javascripts/lunr/min/lunr.he.min.js | 1 +
 assets/javascripts/lunr/min/lunr.hy.min.js | 1 +
 assets/javascripts/lunr/min/lunr.kn.min.js | 1 +
 assets/javascripts/lunr/min/lunr.ko.min.js | 2 +-
 assets/javascripts/lunr/min/lunr.sa.min.js | 1 +
 assets/javascripts/lunr/min/lunr.te.min.js | 1 +
 assets/javascripts/lunr/min/lunr.zh.min.js | 2 +-
 assets/javascripts/lunr/wordcut.js | 4 +-
 ...208ed371.min.js => search.f886a092.min.js} | 18 +-
 ....min.js.map => search.f886a092.min.js.map} | 9 +-
 assets/stylesheets/main.50c56a3b.min.css | 1 +
 assets/stylesheets/main.50c56a3b.min.css.map | 1 +
 assets/stylesheets/main.ded33207.min.css | 1 -
 assets/stylesheets/main.ded33207.min.css.map | 1 -
 assets/stylesheets/palette.06af60db.min.css | 1 +
 .../stylesheets/palette.06af60db.min.css.map | 1 +
 assets/stylesheets/palette.a0c5b2b5.min.css | 1 -
 .../stylesheets/palette.a0c5b2b5.min.css.map | 1 -
 evaluation/evaluation/index.html | 568 ++++++++----
 examples/CIFAR10_deduplication/index.html | 608 +++++++++----
 handlers/metrics/classification/index.html | 548 +++++++----
 .../metrics/information_retrieval/index.html | 596 ++++++++----
 handlers/search/bktree/index.html | 572 ++++++++----
 handlers/search/brute_force/index.html | 560 ++++++++----
 handlers/search/brute_force_cython/index.html | 560 ++++++++----
 handlers/search/retrieval/index.html | 564 ++++++++----
 index.html | 608 +++++++++----
 methods/cnn/index.html | 720 ++++++++++-----
 methods/hashing/index.html | 848 ++++++++++++------
 search/search_index.json | 2 +-
 sitemap.xml | 125 ---
 sitemap.xml.gz | Bin 214 -> 127 bytes
 user_guide/benchmarks/index.html | 648 ++++++++-----
 user_guide/custom_model/index.html | 536 +++++++----
 user_guide/encoding_generation/index.html | 608 +++++++++----
 user_guide/evaluating_performance/index.html | 588 ++++++++----
 user_guide/finding_duplicates/index.html | 608 +++++++++----
 user_guide/plotting_duplicates/index.html | 552 ++++++++----
 utils/data_generator/index.html | 556 ++++++++----
 utils/general_utils/index.html | 568 ++++++++----
 utils/image_utils/index.html | 576 ++++++++----
 utils/logger/index.html | 540 +++++++----
 utils/models/index.html | 632 ++++++++-----
 utils/plotter/index.html | 568 ++++++++----
 52 files changed, 10446 insertions(+), 5200 deletions(-)
 delete mode 100644 assets/javascripts/bundle.51198bba.min.js
 delete mode 100644 assets/javascripts/bundle.51198bba.min.js.map
 create mode 100644 assets/javascripts/bundle.d7c377c4.min.js
 create mode 100644 assets/javascripts/bundle.d7c377c4.min.js.map
 create mode 100644 assets/javascripts/lunr/min/lunr.el.min.js
 create mode 100644 assets/javascripts/lunr/min/lunr.he.min.js
 create mode 100644 assets/javascripts/lunr/min/lunr.hy.min.js
 create mode 100644 assets/javascripts/lunr/min/lunr.kn.min.js
 create mode 100644 assets/javascripts/lunr/min/lunr.sa.min.js
 create mode 100644 assets/javascripts/lunr/min/lunr.te.min.js
 rename assets/javascripts/workers/{search.208ed371.min.js => search.f886a092.min.js} (66%)
 rename assets/javascripts/workers/{search.208ed371.min.js.map => search.f886a092.min.js.map} (62%)
 create mode 100644 assets/stylesheets/main.50c56a3b.min.css
 create mode 100644 assets/stylesheets/main.50c56a3b.min.css.map
 delete mode 100644 assets/stylesheets/main.ded33207.min.css
 delete mode 100644 assets/stylesheets/main.ded33207.min.css.map
 create mode 100644 assets/stylesheets/palette.06af60db.min.css
 create mode 100644 assets/stylesheets/palette.06af60db.min.css.map
 delete mode 100644 assets/stylesheets/palette.a0c5b2b5.min.css
 delete mode 100644 assets/stylesheets/palette.a0c5b2b5.min.css.map
diff --git a/search/search_index.json b/search/search_index.json
index 48ee878b..ceb942da 100644
--- a/search/search_index.json
+++ b/search/search_index.json
@@ -1 +1 @@
-{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Image Deduplicator (imagededup)","text":"

imagededup is a Python package that simplifies the task of finding exact and near duplicates in an image collection.

The package provides hashing algorithms, which are particularly good at finding exact duplicates, as well as convolutional neural networks, which are adept at finding near duplicates. An evaluation framework is also provided to judge the quality of deduplication for a given dataset.

The package provides the following functionality (a short sketch of the shared hashing interface follows the list):

  • Finding duplicates in a directory using one of the following algorithms:
  • Convolutional Neural Network (CNN) - Select from several prepackaged models or provide your own custom model.
  • Perceptual hashing (PHash)
  • Difference hashing (DHash)
  • Wavelet hashing (WHash)
  • Average hashing (AHash)
  • Generation of encodings for images using one of the algorithms stated above.
  • A framework to evaluate the effectiveness of deduplication given a ground truth mapping.
  • Plotting duplicates found for a given image file.
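
As a quick illustration of the hashing interface, here is a minimal sketch (the image path is hypothetical; all four hashers expose the same encode_image method and can be swapped freely):

from imagededup.methods import PHash, DHash, WHash, AHash

# Each hasher returns a 64-bit hash encoded as a hexadecimal string
for hasher_cls in (PHash, DHash, WHash, AHash):
    hasher = hasher_cls()
    encoding = hasher.encode_image(image_file='path/to/image.jpg')  # hypothetical path
    print(hasher_cls.__name__, encoding)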

Detailed documentation for the package can be found at: https://idealo.github.io/imagededup/

imagededup is compatible with Python 3.8+ and runs on Linux, macOS and Windows. It is distributed under the Apache 2.0 license.

"},{"location":"#contents","title":"\ud83d\udcd6 Contents","text":"
  • Installation
  • Quick Start
  • Benchmarks
  • Contribute
  • Citation
  • Maintainers
  • License
"},{"location":"#installation","title":"\u2699\ufe0f Installation","text":"

There are two ways to install imagededup:

  • Install imagededup from PyPI (recommended):
pip install imagededup
  • Install imagededup from the GitHub source:
git clone https://github.com/idealo/imagededup.git
cd imagededup
pip install "cython>=0.29"
python setup.py install
"},{"location":"#quick-start","title":"\ud83d\ude80 Quick Start","text":"

To find duplicates in an image directory using perceptual hashing, the following workflow can be used:

  • Import the perceptual hashing method
from imagededup.methods import PHash
phasher = PHash()
  • Generate encodings for all images in an image directory
encodings = phasher.encode_images(image_dir='path/to/image/directory')
  • Find duplicates using the generated encodings
duplicates = phasher.find_duplicates(encoding_map=encodings)
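
Optionally, the strictness of matching can be controlled; a sketch assuming the documented hashing defaults (max_distance_threshold defaults to 10, and scores=True attaches the Hamming distance of each retrieved duplicate):

# Stricter matching with per-pair Hamming distances (the threshold value is illustrative)
duplicates = phasher.find_duplicates(encoding_map=encodings,
                                     max_distance_threshold=4,
                                     scores=True)
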
  • Plot duplicates obtained for a given file (e.g. 'ukbench00120.jpg') using the duplicates dictionary
from imagededup.utils import plot_duplicates
plot_duplicates(image_dir='path/to/image/directory',
                duplicate_map=duplicates,
                filename='ukbench00120.jpg')

The output is a plot showing the given image along with its retrieved duplicates.

The complete code for the workflow is:

from imagededup.methods import PHash
phasher = PHash()

# Generate encodings for all images in an image directory
encodings = phasher.encode_images(image_dir='path/to/image/directory')

# Find duplicates using the generated encodings
duplicates = phasher.find_duplicates(encoding_map=encodings)

# Plot duplicates obtained for a given file using the duplicates dictionary
from imagededup.utils import plot_duplicates
plot_duplicates(image_dir='path/to/image/directory',
                duplicate_map=duplicates,
                filename='ukbench00120.jpg')

It is also possible to use your own custom models for finding duplicates using the CNN method.
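
For instance, a minimal CNN-based sketch (min_similarity_threshold is the cosine-similarity counterpart of the hashing distance cutoff; the value 0.85 is illustrative, and supplying a custom model is covered in the custom model user guide):

from imagededup.methods import CNN

cnn = CNN()  # uses a prepackaged model by default

encodings = cnn.encode_images(image_dir='path/to/image/directory')
duplicates = cnn.find_duplicates(encoding_map=encodings,
                                 min_similarity_threshold=0.85)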

For examples, refer to this part of the repository.

For more detailed usage of the package functionality, refer: https://idealo.github.io/imagededup/

"},{"location":"#benchmarks","title":"\u23f3 Benchmarks","text":"

Update: The provided benchmarks are only valid up to imagededup v0.2.2. Later releases introduce significant changes to all methods, so the current benchmarks may not hold.

Detailed benchmarks on speed and classification metrics for the different methods are provided in the documentation. Generally speaking, the following conclusions can be drawn:

  • CNN works best for near duplicates and datasets containing transformations.
  • All deduplication methods fare well on datasets containing exact duplicates, but Difference hashing is the fastest.
"},{"location":"#contribute","title":"\ud83e\udd1d Contribute","text":"

We welcome all kinds of contributions. See the Contribution guide for more details.

"},{"location":"#citation","title":"\ud83d\udcdd Citation","text":"

Please cite Imagededup in your publications if this is useful for your research. Here is an example BibTeX entry:

@misc{idealods2019imagededup,
  title={Imagededup},
  author={Tanuj Jain and Christopher Lennan and Zubin John and Dat Tran},
  year={2019},
  howpublished={\url{https://github.com/idealo/imagededup}},
}
"},{"location":"#maintainers","title":"\ud83c\udfd7 Maintainers","text":"
  • Tanuj Jain, github: tanujjain
  • Christopher Lennan, github: clennan
  • Dat Tran, github: datitran
"},{"location":"#copyright","title":"\u00a9 Copyright","text":"

See LICENSE for details.

"},{"location":"CONTRIBUTING/","title":"Contribution Guide","text":"

We welcome any contribution, whether it is:

  • Submitting feedback
  • Fixing bugs
  • Implementing a new feature.

Please read this guide before making any contributions.

"},{"location":"CONTRIBUTING/#submit-feedback","title":"Submit Feedback","text":"

Feedback should be submitted by creating an issue on the GitHub issue tracker. Select the appropriate template (bug report, feature request, or custom) and add the corresponding labels.

"},{"location":"CONTRIBUTING/#fix-bugs","title":"Fix Bugs","text":"

You may look through the GitHub issues for bugs.

"},{"location":"CONTRIBUTING/#implement-features","title":"Implement Features","text":"

You may look through the GitHub issues for feature requests.

"},{"location":"CONTRIBUTING/#pull-requests-pr","title":"Pull Requests (PR)","text":"
  1. Fork the repository and create a new branch from the dev branch.
  2. For bug fixes, add new tests; for new features, please also update the documentation.
  3. Open a PR from your new branch to the dev branch of the original imagededup repo.
"},{"location":"CONTRIBUTING/#documentation","title":"Documentation","text":"
  • Make sure any new function or class you introduce has proper docstrings.
"},{"location":"CONTRIBUTING/#testing","title":"Testing","text":"
  • We use pytest for our testing. Make sure to write tests for any new feature and/or bug fix; a minimal example sketch follows.
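
For orientation, a minimal test might look like the following sketch (the fixture path is hypothetical):

from imagededup.methods import PHash

def test_phash_encoding_is_hex_string():
    # encode_image returns the 64-bit perceptual hash as a hex string
    phasher = PHash()
    encoding = phasher.encode_image(image_file='tests/data/sample.jpg')  # hypothetical fixture
    assert isinstance(encoding, str)
    int(encoding, 16)  # raises ValueError if the string is not valid hex
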
"},{"location":"CONTRIBUTING/#main-contributor-list","title":"Main Contributor List","text":"

We maintain a list of main contributors to appreciate all the contributions.

"},{"location":"LICENSE/","title":"License","text":"

Copyright 2019 idealo internet GmbH. All rights reserved.

                             Apache License
                       Version 2.0, January 2004
                    http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

  1. Definitions.

    \"License\" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.

    \"Licensor\" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.

    \"Legal Entity\" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, \"control\" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.

    \"You\" (or \"Your\") shall mean an individual or Legal Entity exercising permissions granted by this License.

    \"Source\" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.

    \"Object\" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.

    \"Work\" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).

    \"Derivative Works\" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof.

    \"Contribution\" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, \"submitted\" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as \"Not a Contribution.\"

    \"Contributor\" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work.

  2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.

  3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed.

  4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:

    (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and

    (b) You must cause any modified files to carry prominent notices stating that You changed the files; and

    (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and

    (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License.

    You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License.

  5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions.

  6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file.

  7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License.

  8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.

  9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work.

  To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets \"[]\" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same \"printed page\" as the copyright notice for easier identification within third-party archives.

Copyright [yyyy] [name of copyright owner]

Licensed under the Apache License, Version 2.0 (the \"License\"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0\n

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

"},{"location":"evaluation/evaluation/","title":"Evaluation","text":""},{"location":"evaluation/evaluation/#evaluate","title":"evaluate","text":"
def evaluate(ground_truth_map, retrieved_map, metric)\n

Given a ground truth map and a duplicate map retrieved from a deduplication algorithm, get metrics to evaluate the effectiveness of the applied deduplication algorithm.

"},{"location":"evaluation/evaluation/#args","title":"Args","text":"
  • ground_truth_map: A dictionary representing ground truth with filenames as key and a list of duplicate filenames as value.

  • retrieved_map: A dictionary representing retrieved duplicates with filenames as key and a list of retrieved duplicate filenames as value.

  • metric: Name of metric to be evaluated and returned. Accepted values are: 'map', 'ndcg', 'jaccard', 'classification', 'all' (default, returns every metric).

"},{"location":"evaluation/evaluation/#returns","title":"Returns","text":"
  • dictionary: A dictionary with the metric name as key and the corresponding calculated metric as value. 'map', 'ndcg' and 'jaccard' return a single number denoting the corresponding information retrieval metric. 'classification' metrics include 'precision', 'recall' and 'f1-score', which are returned as individual entries in the returned dictionary. The value for each classification metric is a numpy array with the first entry as the score for non-duplicate file pairs (class-0) and the second entry as the score for duplicate file pairs (class-1). Additionally, a support is returned as another key, with the first entry denoting the number of non-duplicate file pairs and the second entry the number of duplicate file pairs.
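
A minimal usage sketch (the file names and maps below are hypothetical):

from imagededup.evaluation import evaluate\n\n# hypothetical, symmetric ground truth and retrieved duplicate maps\nground_truth_map = {'1.jpg': ['2.jpg'], '2.jpg': ['1.jpg'], '3.jpg': []}\nretrieved_map = {'1.jpg': ['2.jpg'], '2.jpg': ['1.jpg'], '3.jpg': []}\n\nmetrics = evaluate(ground_truth_map, retrieved_map, metric='all')\n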
"},{"location":"examples/CIFAR10_deduplication/","title":"CIFAR10 deduplication example","text":""},{"location":"examples/CIFAR10_deduplication/#install-imagededup-via-pypi","title":"Install imagededup via PyPI","text":"
!pip install imagededup\n
"},{"location":"examples/CIFAR10_deduplication/#download-cifar10-dataset-and-untar","title":"Download CIFAR10 dataset and untar","text":"
!wget http://pjreddie.com/media/files/cifar.tgz\n!tar xzf cifar.tgz\n
"},{"location":"examples/CIFAR10_deduplication/#create-working-directory-and-move-all-images-into-this-directory","title":"Create working directory and move all images into this directory","text":"
image_dir = 'cifar10_images'\n!mkdir $image_dir\n!cp -r '/content/cifar/train/.' $image_dir\n!cp -r '/content/cifar/test/.' $image_dir\n
"},{"location":"examples/CIFAR10_deduplication/#find-duplicates-in-the-entire-dataset-with-cnn","title":"Find duplicates in the entire dataset with CNN","text":"
from imagededup.methods import CNN\n\ncnn = CNN()\nencodings = cnn.encode_images(image_dir=image_dir)\nduplicates = cnn.find_duplicates(encoding_map=encodings)\n
"},{"location":"examples/CIFAR10_deduplication/#do-some-imports-for-plotting","title":"Do some imports for plotting","text":"
from pathlib import Path\nfrom imagededup.utils import plot_duplicates\nimport matplotlib.pyplot as plt\nplt.rcParams['figure.figsize'] = (15, 10)\n
"},{"location":"examples/CIFAR10_deduplication/#find-and-plot-duplicates-in-the-test-set-with-cnn","title":"Find and plot duplicates in the test set with CNN","text":"
# test images are stored under '/content/cifar/test'\nfilenames_test = set([i.name for i in Path('/content/cifar/test').glob('*.png')])\n\nduplicates_test = {}\nfor k, v in duplicates.items():\n  if k in filenames_test:\n    tmp = [i for i in v if i in filenames_test]\n    duplicates_test[k] = tmp\n\n# sort in descending order of duplicates\nduplicates_test = {k: v for k, v in sorted(duplicates_test.items(), key=lambda x: len(x[1]), reverse=True)}\n\n# plot duplicates found for some file\nplot_duplicates(image_dir=image_dir, duplicate_map=duplicates_test, filename=list(duplicates_test.keys())[0])\n
"},{"location":"examples/CIFAR10_deduplication/#find-and-plot-duplicates-in-the-train-set-with-cnn","title":"Find and plot duplicates in the train set with CNN","text":"
# train images are stored under '/content/cifar/train'\nfilenames_train = set([i.name for i in Path('/content/cifar/train').glob('*.png')])\n\nduplicates_train = {}\nfor k, v in duplicates.items():\n  if k in filenames_train:\n    tmp = [i for i in v if i in filenames_train]\n    duplicates_train[k] = tmp\n\n\n# sort in descending order of duplicates\nduplicates_train = {k: v for k, v in sorted(duplicates_train.items(), key=lambda x: len(x[1]), reverse=True)}\n\n# plot duplicates found for some file\nplot_duplicates(image_dir=image_dir, duplicate_map=duplicates_train, filename=list(duplicates_train.keys())[0])\n
"},{"location":"examples/CIFAR10_deduplication/#examples-from-test-set-with-duplicates-in-train-set","title":"Examples from test set with duplicates in train set","text":"
# keep only filenames that are in the test set and have duplicates in the train set\nduplicates_test_train = {}\nfor k, v in duplicates.items():\n    if k in filenames_test:\n        tmp = [i for i in v if i in filenames_train]\n        duplicates_test_train[k] = tmp\n\n# sort in descending order of duplicates\nduplicates_test_train = {k: v for k, v in sorted(duplicates_test_train.items(), key=lambda x: len(x[1]), reverse=True)}\n\n# plot duplicates found for some file\nplot_duplicates(image_dir=image_dir, duplicate_map=duplicates_test_train, filename=list(duplicates_test_train.keys())[0])\n
"},{"location":"handlers/metrics/classification/","title":"Classification","text":""},{"location":"handlers/metrics/classification/#classification_metrics","title":"classification_metrics","text":"
def classification_metrics(ground_truth, retrieved)\n

Given ground truth dictionary and retrieved dictionary, return per class precision, recall and f1 score. Class 1 is assigned to duplicate file pairs while class 0 is for non-duplicate file pairs.

"},{"location":"handlers/metrics/classification/#args","title":"Args","text":"
  • ground_truth: A dictionary representing ground truth with filenames as key and a list of duplicate filenames as value.

  • retrieved: A dictionary representing retrieved duplicates with filenames as key and a list of retrieved duplicate filenames as value.

"},{"location":"handlers/metrics/classification/#returns","title":"Returns","text":""},{"location":"handlers/metrics/information_retrieval/","title":"Information retrieval","text":""},{"location":"handlers/metrics/information_retrieval/#avg_prec","title":"avg_prec","text":"
def avg_prec(correct_duplicates, retrieved_duplicates)\n

Get average precision (AP) for a single query, given correct and retrieved file names.

"},{"location":"handlers/metrics/information_retrieval/#args","title":"Args","text":"
  • correct_duplicates: List of correct duplicates (i.e., ground truth)

  • retrieved_duplicates: List of retrieved duplicates for a single query

"},{"location":"handlers/metrics/information_retrieval/#returns","title":"Returns","text":""},{"location":"handlers/metrics/information_retrieval/#ndcg","title":"ndcg","text":"
def ndcg(correct_duplicates, retrieved_duplicates)\n

Get Normalized Discounted Cumulative Gain (NDCG) for a single query, given correct and retrieved file names.

"},{"location":"handlers/metrics/information_retrieval/#args_1","title":"Args","text":"
  • correct_duplicates: List of correct duplicates (i.e., ground truth)

  • retrieved_duplicates: List of retrieved duplicates for a single query

"},{"location":"handlers/metrics/information_retrieval/#returns_1","title":"Returns","text":""},{"location":"handlers/metrics/information_retrieval/#jaccard_similarity","title":"jaccard_similarity","text":"
def jaccard_similarity(correct_duplicates, retrieved_duplicates)\n

Get Jaccard similarity for a single query, given correct and retrieved file names.

"},{"location":"handlers/metrics/information_retrieval/#args_2","title":"Args","text":"
  • correct_duplicates: List of correct duplicates (i.e., ground truth)

  • retrieved_duplicates: List of retrieved duplicates for a single query

"},{"location":"handlers/metrics/information_retrieval/#returns_2","title":"Returns","text":""},{"location":"handlers/metrics/information_retrieval/#mean_metric","title":"mean_metric","text":"
def mean_metric(ground_truth, retrieved, metric)\n

Get the mean of the specified metric across all queries.

"},{"location":"handlers/metrics/information_retrieval/#args_3","title":"Args","text":"
  • metric: Metric function for which the mean across all queries is to be calculated.
"},{"location":"handlers/metrics/information_retrieval/#returns_3","title":"Returns","text":""},{"location":"handlers/metrics/information_retrieval/#get_all_metrics","title":"get_all_metrics","text":"
def get_all_metrics(ground_truth, retrieved)\n

Get mean of all information retrieval metrics across all queries.

"},{"location":"handlers/metrics/information_retrieval/#args_4","title":"Args","text":"
  • ground_truth: A dictionary representing ground truth with filenames as key and a list of duplicate filenames as value.

  • retrieved: A dictionary representing retrieved duplicates with filenames as key and a list of retrieved duplicate filenames as value.

"},{"location":"handlers/metrics/information_retrieval/#returns_4","title":"Returns","text":""},{"location":"handlers/search/bktree/","title":"Bktree","text":""},{"location":"handlers/search/bktree/#class-bktreenode","title":"class BkTreeNode","text":"

Class to contain the attributes of a single node in the BKTree.

"},{"location":"handlers/search/bktree/#__init__","title":"__init__","text":"
def __init__(node_name, node_value, parent_name)\n
"},{"location":"handlers/search/bktree/#class-bktree","title":"class BKTree","text":"

Class to construct and perform search using a BKTree.

"},{"location":"handlers/search/bktree/#__init___1","title":"__init__","text":"
def __init__(hash_dict, distance_function)\n

Initialize a root for the BKTree and trigger the tree construction using the dictionary mapping file names to corresponding hashes.

"},{"location":"handlers/search/bktree/#args","title":"Args","text":"
  • hash_dict: Dictionary mapping file names to corresponding hash strings {filename: hash}

  • distance_function: A function for calculating distance between the hashes.

"},{"location":"handlers/search/bktree/#construct_tree","title":"construct_tree","text":"
def construct_tree()\n

Construct the BKTree.

"},{"location":"handlers/search/bktree/#search","title":"search","text":"
def search(query, tol)\n

Function to search the BKTree given the hash of a query image.

"},{"location":"handlers/search/bktree/#args_1","title":"Args","text":"
  • query: hash string for which BKTree needs to be searched.

  • tol: distance up to which a retrieved duplicate is considered valid.

"},{"location":"handlers/search/bktree/#returns","title":"Returns","text":"
  • List of tuples of the form [(valid_retrieval_filename1, distance), (valid_retrieval_filename2, distance)]
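
A minimal usage sketch (assuming the module path mirrors the docs location; the hashes and distance function come from one of the hashing methods, here PHash):

from imagededup.methods import PHash\nfrom imagededup.handlers.search.bktree import BKTree\n\nphasher = PHash()\nhash_dict = phasher.encode_images(image_dir='path/to/image/directory')\n\n# build the tree and search with the hash of one image as the query\ntree = BKTree(hash_dict=hash_dict, distance_function=phasher.hamming_distance)\nmatches = tree.search(query=list(hash_dict.values())[0], tol=10)  # [(filename, distance), ...]\n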
"},{"location":"handlers/search/brute_force/","title":"Brute force","text":""},{"location":"handlers/search/brute_force/#class-bruteforce","title":"class BruteForce","text":"

Class to perform search using brute force.

"},{"location":"handlers/search/brute_force/#__init__","title":"__init__","text":"
def __init__(hash_dict, distance_function)\n

Initialize a dictionary mapping file names to corresponding hashes and a distance function to be used for getting the distance between two hash strings.

"},{"location":"handlers/search/brute_force/#args","title":"Args","text":"
  • hash_dict: Dictionary mapping file names to corresponding hash strings {filename: hash}

  • distance_function: A function for calculating distance between the hashes.

"},{"location":"handlers/search/brute_force/#search","title":"search","text":"
def search(query, tol)\n

Function for searching using brute force.

"},{"location":"handlers/search/brute_force/#args_1","title":"Args","text":"
  • query: hash string for which brute force search needs to be performed.

  • tol: distance up to which a retrieved duplicate is considered valid.

"},{"location":"handlers/search/brute_force/#returns","title":"Returns","text":"
  • List of tuples of the form [(valid_retrieval_filename1, distance), (valid_retrieval_filename2, distance)]
"},{"location":"handlers/search/brute_force_cython/","title":"Brute force cython","text":""},{"location":"handlers/search/brute_force_cython/#class-bruteforcecython","title":"class BruteForceCython","text":"

Class to perform search using brute force.

"},{"location":"handlers/search/brute_force_cython/#__init__","title":"__init__","text":"
def __init__(hash_dict, distance_function)\n

Initialize a dictionary mapping file names to corresponding hashes and a distance function to be used for getting the distance between two hash strings.

"},{"location":"handlers/search/brute_force_cython/#args","title":"Args","text":"
  • hash_dict: Dictionary mapping file names to corresponding hash strings {filename: hash}

  • distance_function: A function for calculating distance between the hashes.

"},{"location":"handlers/search/brute_force_cython/#search","title":"search","text":"
def search(query, tol)\n

Function for searching using brute force.

"},{"location":"handlers/search/brute_force_cython/#args_1","title":"Args","text":"
  • query: hash string for which brute force search needs to be performed.

  • tol: distance up to which a retrieved duplicate is considered valid.

"},{"location":"handlers/search/brute_force_cython/#returns","title":"Returns","text":"
  • List of tuples of the form [(valid_retrieval_filename1, distance), (valid_retrieval_filename2, distance)]
"},{"location":"handlers/search/retrieval/","title":"Retrieval","text":""},{"location":"handlers/search/retrieval/#cosine_similarity_chunk","title":"cosine_similarity_chunk","text":"
def cosine_similarity_chunk(t)\n
"},{"location":"handlers/search/retrieval/#get_cosine_similarity","title":"get_cosine_similarity","text":"
def get_cosine_similarity(X, verbose, chunk_size, threshold, num_workers)\n
"},{"location":"handlers/search/retrieval/#class-hasheval","title":"class HashEval","text":""},{"location":"handlers/search/retrieval/#__init__","title":"__init__","text":"
def __init__(test, queries, distance_function, verbose, threshold, search_method, num_dist_workers)\n

Initialize a HashEval object, which offers an interface to control hashing and search methods for the desired dataset. Computes a map of duplicate images in the document space given certain input control parameters.

"},{"location":"handlers/search/retrieval/#retrieve_results","title":"retrieve_results","text":"
def retrieve_results(scores)\n

Return results with or without scores.

"},{"location":"handlers/search/retrieval/#args","title":"Args","text":"
  • scores: Boolean indicating whether results are to be returned with or without scores.
"},{"location":"handlers/search/retrieval/#returns","title":"Returns","text":"
  • if scores is True, then a dictionary of the form {'image1.jpg': [('image1_duplicate1.jpg', score), ('image1_duplicate2.jpg', score)], 'image2.jpg': [] ..}

  • if scores is False, then a dictionary of the form {'image1.jpg': ['image1_duplicate1.jpg', 'image1_duplicate2.jpg'], 'image2.jpg': ['image1_duplicate1.jpg',..], ..}

"},{"location":"methods/cnn/","title":"CNN","text":""},{"location":"methods/cnn/#class-cnn","title":"class CNN","text":"

Find duplicates using CNN and/or generate CNN encodings given a single image or a directory of images.

The module can be used for 2 purposes: Encoding generation and duplicate detection.

  • Encoding generation: To propagate an image through a Convolutional Neural Network architecture and generate encodings. The generated encodings can be used at a later time for deduplication. Using the method 'encode_image', the CNN encodings for a single image can be obtained, while the 'encode_images' method can be used to get encodings for all images in a directory.

  • Duplicate detection: Find duplicates either using the encoding mapping generated previously using 'encode_images' or using a Path to the directory that contains the images that need to be deduplicated. 'find_duplicates' and 'find_duplicates_to_remove' methods are provided to accomplish these tasks.

"},{"location":"methods/cnn/#__init__","title":"__init__","text":"
def __init__(verbose, model_config)\n

Initialize a PyTorch MobileNetV3 model sliced at the last convolutional layer. The batch size for the PyTorch dataloader is set to 64 samples.

"},{"location":"methods/cnn/#args","title":"Args","text":"
  • verbose: Display progress bar if True else disable it. Default value is True.

  • model_config: A CustomModel that can be used to initialize a custom PyTorch model along with the corresponding transform.

"},{"location":"methods/cnn/#apply_preprocess","title":"apply_preprocess","text":"
def apply_preprocess(im_arr)\n

Apply the MobileNet preprocessing function to images.

"},{"location":"methods/cnn/#args_1","title":"Args","text":"
  • im_arr: Image typecast to numpy array.
"},{"location":"methods/cnn/#returns","title":"Returns","text":"
  • transformed_image_tensor: Transformed images returned as a pytorch tensor.
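
A minimal sketch of calling this preprocessing step directly (image path hypothetical):

import numpy as np\nfrom PIL import Image\nfrom imagededup.methods import CNN\n\nmyencoder = CNN()\nim_arr = np.array(Image.open('path/to/image.jpg'))  # image typecast to numpy array\ntransformed_image_tensor = myencoder.apply_preprocess(im_arr)  # pytorch tensor ready for the model\n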
"},{"location":"methods/cnn/#encode_image","title":"encode_image","text":"
def encode_image(image_file, image_array)\n

Generate CNN encoding for a single image.

"},{"location":"methods/cnn/#args_2","title":"Args","text":"
  • image_file: Path to the image file.

  • image_array: Optional, used instead of image_file. Image typecast to numpy array.

"},{"location":"methods/cnn/#returns_1","title":"Returns","text":"
  • encoding: Encodings for the image in the form of numpy array.
"},{"location":"methods/cnn/#example-usage","title":"Example usage","text":"
from imagededup.methods import CNN\nmyencoder = CNN()\nencoding = myencoder.encode_image(image_file='path/to/image.jpg')\nOR\nencoding = myencoder.encode_image(image_array=<numpy array of image>)\n
"},{"location":"methods/cnn/#encode_images","title":"encode_images","text":"
def encode_images(image_dir, recursive, num_enc_workers)\n

Generate CNN encodings for all images in a given directory of images.

"},{"location":"methods/cnn/#args_3","title":"Args","text":"
  • image_dir: Path to the image directory.

  • recursive: Optional, find images recursively in a nested image directory structure, set to False by default.

  • num_enc_workers: Optional, number of cpu cores to use for multiprocessing encoding generation (supported only on the Linux platform), set to 0 by default. 0 disables multiprocessing.

"},{"location":"methods/cnn/#returns_2","title":"Returns","text":"
  • dictionary: Contains a mapping of filenames and corresponding numpy array of CNN encodings.
"},{"location":"methods/cnn/#example-usage_1","title":"Example usage","text":"
from imagededup.methods import CNN\nmyencoder = CNN()\nencoding_map = myencoder.encode_images(image_dir='path/to/image/directory')\n
"},{"location":"methods/cnn/#find_duplicates","title":"find_duplicates","text":"
def find_duplicates(image_dir, encoding_map, min_similarity_threshold, scores, outfile, recursive, num_enc_workers, num_sim_workers)\n

Find duplicates for each file. Takes in a path to the directory or an encoding dictionary in which duplicates are to be detected above the given threshold. Returns a dictionary containing the filename as key and a list of duplicate file names as value. Optionally, the cosine similarity scores can be returned along with the duplicate filenames for each query file.

"},{"location":"methods/cnn/#args_4","title":"Args","text":"
  • image_dir: Path to the directory containing all the images or dictionary with keys as file names and values as numpy arrays which represent the CNN encoding for the key image file.

  • encoding_map: Optional, used instead of image_dir, a dictionary containing mapping of filenames and corresponding CNN encodings.

  • min_similarity_threshold: Optional, threshold value (must be float between -1.0 and 1.0). Default is 0.9

  • scores: Optional, boolean indicating whether similarity scores are to be returned along with retrieved duplicates.

  • outfile: Optional, name of the file to save the results, must be a json. Default is None.

  • recursive: Optional, find images recursively in a nested image directory structure, set to False by default.

  • num_enc_workers: Optional, number of cpu cores to use for multiprocessing encoding generation (supported only on the Linux platform), set to 0 by default. 0 disables multiprocessing.

  • num_sim_workers: Optional, number of cpu cores to use for multiprocessing similarity computation, set to number of CPUs in the system by default. 0 disables multiprocessing.

"},{"location":"methods/cnn/#returns_3","title":"Returns","text":"
  • dictionary: if scores is True, then a dictionary of the form {'image1.jpg': [('image1_duplicate1.jpg', score), ('image1_duplicate2.jpg', score)], 'image2.jpg': [] ..}. if scores is False, then a dictionary of the form {'image1.jpg': ['image1_duplicate1.jpg', 'image1_duplicate2.jpg'], 'image2.jpg':['image1_duplicate1.jpg',..], ..}
"},{"location":"methods/cnn/#example-usage_2","title":"Example usage","text":"
from imagededup.methods import CNN\nmyencoder = CNN()\nduplicates = myencoder.find_duplicates(image_dir='path/to/directory', min_similarity_threshold=0.85, scores=True,\noutfile='results.json')\n\nOR\n\nfrom imagededup.methods import CNN\nmyencoder = CNN()\nduplicates = myencoder.find_duplicates(encoding_map=<mapping filename to cnn encodings>,\nmin_similarity_threshold=0.85, scores=True, outfile='results.json')\n
"},{"location":"methods/cnn/#find_duplicates_to_remove","title":"find_duplicates_to_remove","text":"
def find_duplicates_to_remove(image_dir, encoding_map, min_similarity_threshold, outfile, recursive, num_enc_workers, num_sim_workers)\n

Give out a list of image file names to remove based on the similarity threshold. Does not remove the mentioned files.

"},{"location":"methods/cnn/#args_5","title":"Args","text":"
  • image_dir: Path to the directory containing all the images or dictionary with keys as file names and values as numpy arrays which represent the CNN encoding for the key image file.

  • encoding_map: Optional, used instead of image_dir, a dictionary containing mapping of filenames and corresponding CNN encodings.

  • min_similarity_threshold: Optional, threshold value (must be float between -1.0 and 1.0). Default is 0.9

  • outfile: Optional, name of the file to save the results, must be a json. Default is None.

  • recursive: Optional, find images recursively in a nested image directory structure, set to False by default.

  • num_enc_workers: Optional, number of cpu cores to use for multiprocessing encoding generation (supported only on the Linux platform), set to 0 by default. 0 disables multiprocessing.

  • num_sim_workers: Optional, number of cpu cores to use for multiprocessing similarity computation, set to number of CPUs in the system by default. 0 disables multiprocessing.

"},{"location":"methods/cnn/#returns_4","title":"Returns","text":"
  • duplicates: List of image file names that should be removed.
"},{"location":"methods/cnn/#example-usage_3","title":"Example usage","text":"
from imagededup.methods import CNN\nmyencoder = CNN()\nduplicates = myencoder.find_duplicates_to_remove(image_dir='path/to/images/directory',\nmin_similarity_threshold=0.85)\n\nOR\n\nfrom imagededup.methods import CNN\nmyencoder = CNN()\nduplicates = myencoder.find_duplicates_to_remove(encoding_map=<mapping filename to cnn encodings>,\nmin_similarity_threshold=0.85, outfile='results.json')\n
"},{"location":"methods/hashing/","title":"Hashing","text":""},{"location":"methods/hashing/#class-hashing","title":"class Hashing","text":"

Find duplicates using hashing algorithms and/or generate hashes given a single image or a directory of images.

The module can be used for 2 purposes: Encoding generation and duplicate detection.

  • Encoding generation: To generate hashes using a specific hashing method. The generated hashes can be used at a later time for deduplication. Using the method 'encode_image' from the specific hashing method object, the hash for a single image can be obtained, while the 'encode_images' method can be used to get hashes for all images in a directory.

  • Duplicate detection: Find duplicates either using the encoding mapping generated previously using 'encode_images' or using a Path to the directory that contains the images that need to be deduplicated. 'find_duplicates' and 'find_duplicates_to_remove' methods are provided to accomplish these tasks.

"},{"location":"methods/hashing/#__init__","title":"__init__","text":"
def __init__(verbose)\n

Initialize hashing class.

"},{"location":"methods/hashing/#args","title":"Args","text":"
  • verbose: Display progress bar if True else disable it. Default value is True.
"},{"location":"methods/hashing/#hamming_distance","title":"hamming_distance","text":"
def hamming_distance(hash1, hash2)\n

Calculate the hamming distance between two hashes. If the length of the hashes is not 64 bits, each hash is padded to 64 bits before the hamming distance is calculated.

"},{"location":"methods/hashing/#args_1","title":"Args","text":"
  • hash1: hash string

  • hash2: hash string

"},{"location":"methods/hashing/#returns","title":"Returns","text":"
  • hamming_distance: Hamming distance between the two hashes.
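
A minimal sketch using two perceptual hashes (image paths hypothetical):

from imagededup.methods import PHash\n\nphasher = PHash()\nhash1 = phasher.encode_image(image_file='path/to/image1.jpg')\nhash2 = phasher.encode_image(image_file='path/to/image2.jpg')\ndistance = phasher.hamming_distance(hash1, hash2)  # integer between 0 and 64\n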
"},{"location":"methods/hashing/#encode_image","title":"encode_image","text":"
def encode_image(image_file, image_array)\n

Generate hash for a single image.

"},{"location":"methods/hashing/#args_2","title":"Args","text":"
  • image_file: Path to the image file.

  • image_array: Optional, used instead of image_file. Image typecast to numpy array.

"},{"location":"methods/hashing/#returns_1","title":"Returns","text":"
  • hash: A 16 character hexadecimal string hash for the image.
"},{"location":"methods/hashing/#example-usage","title":"Example usage","text":"
from imagededup.methods import <hash-method>\nmyencoder = <hash-method>()\nmyhash = myencoder.encode_image(image_file='path/to/image.jpg')\nOR\nmyhash = myencoder.encode_image(image_array=<numpy array of image>)\n
"},{"location":"methods/hashing/#encode_images","title":"encode_images","text":"
def encode_images(image_dir, recursive, num_enc_workers)\n

Generate hashes for all images in a given directory of images.

"},{"location":"methods/hashing/#args_3","title":"Args","text":"
  • image_dir: Path to the image directory.

  • recursive: Optional, find images recursively in a nested image directory structure, set to False by default.

  • num_enc_workers: Optional, number of cpu cores to use for multiprocessing encoding generation, set to number of CPUs in the system by default. 0 disables multiprocessing.

"},{"location":"methods/hashing/#returns_2","title":"Returns","text":"
  • dictionary: A dictionary that contains a mapping of filenames and corresponding 16 character hexadecimal hash strings such as {'Image1.jpg': 'hash_string1', 'Image2.jpg': 'hash_string2', ...}
"},{"location":"methods/hashing/#example-usage_1","title":"Example usage","text":"
from imagededup.methods import <hash-method>\nmyencoder = <hash-method>()\nmapping = myencoder.encode_images('path/to/directory')\n
"},{"location":"methods/hashing/#find_duplicates","title":"find_duplicates","text":"
def find_duplicates(image_dir, encoding_map, max_distance_threshold, scores, outfile, search_method, recursive, num_enc_workers, num_dist_workers)\n

Find duplicates for each file. Takes in a path to the directory or an encoding dictionary in which duplicates are to be detected. All images with hamming distance less than or equal to the max_distance_threshold are regarded as duplicates. Returns a dictionary containing the filename as key and a list of duplicate file names as value. Optionally, the hamming distances below the given threshold can be returned instead of just the duplicate filenames for each query file.

"},{"location":"methods/hashing/#args_4","title":"Args","text":"
  • image_dir: Path to the directory containing all the images or dictionary with keys as file names and values as hash strings for the key image file.

  • encoding_map: Optional, used instead of image_dir, a dictionary containing mapping of filenames and corresponding hashes.

  • max_distance_threshold: Optional, hamming distance between two images below which retrieved duplicates are valid (must be an int between 0 and 64). Default is 10.

  • scores: Optional, boolean indicating whether Hamming distances are to be returned along with retrieved duplicates.

  • outfile: Optional, name of the file to save the results, must be a json. Default is None.

  • search_method: Algorithm used to retrieve duplicates. Default is brute_force_cython on Unix, bktree otherwise.

  • recursive: Optional, find images recursively in a nested image directory structure, set to False by default.

  • num_enc_workers: Optional, number of cpu cores to use for multiprocessing encoding generation, set to number of CPUs in the system by default. 0 disables multiprocessing.

  • num_dist_workers: Optional, number of cpu cores to use for multiprocessing distance computation, set to number of CPUs in the system by default. 0 disables multiprocessing.

"},{"location":"methods/hashing/#returns_3","title":"Returns","text":"
  • duplicates dictionary: if scores is True, then a dictionary of the form {'image1.jpg': [('image1_duplicate1.jpg', score), ('image1_duplicate2.jpg', score)], 'image2.jpg': [] ..}. if scores is False, then a dictionary of the form {'image1.jpg': ['image1_duplicate1.jpg', 'image1_duplicate2.jpg'], 'image2.jpg':['image1_duplicate1.jpg',..], ..}
"},{"location":"methods/hashing/#example-usage_2","title":"Example usage","text":"
from imagededup.methods import <hash-method>\nmyencoder = <hash-method>()\nduplicates = myencoder.find_duplicates(image_dir='path/to/directory', max_distance_threshold=15, scores=True,\noutfile='results.json')\n\nOR\n\nfrom imagededup.methods import <hash-method>\nmyencoder = <hash-method>()\nduplicates = myencoder.find_duplicates(encoding_map=<mapping filename to hashes>,\nmax_distance_threshold=15, scores=True, outfile='results.json')\n
"},{"location":"methods/hashing/#find_duplicates_to_remove","title":"find_duplicates_to_remove","text":"
def find_duplicates_to_remove(image_dir, encoding_map, max_distance_threshold, outfile, recursive, num_enc_workers, num_dist_workers)\n

Give out a list of image file names to remove based on the hamming distance threshold. Does not remove the mentioned files.

"},{"location":"methods/hashing/#args_5","title":"Args","text":"
  • image_dir: Path to the directory containing all the images or dictionary with keys as file names and values as hash strings for the key image file.

  • encoding_map: Optional, used instead of image_dir, a dictionary containing mapping of filenames and corresponding hashes.

  • max_distance_threshold: Optional, hamming distance between two images below which retrieved duplicates are valid (must be an int between 0 and 64). Default is 10.

  • outfile: Optional, name of the file to save the results, must be a json. Default is None.

  • recursive: Optional, find images recursively in a nested image directory structure, set to False by default.

  • num_enc_workers: Optional, number of cpu cores to use for multiprocessing encoding generation, set to number of CPUs in the system by default. 0 disables multiprocessing.

  • num_dist_workers: Optional, number of cpu cores to use for multiprocessing distance computation, set to number of CPUs in the system by default. 0 disables multiprocessing.

"},{"location":"methods/hashing/#returns_4","title":"Returns","text":"
  • duplicates: List of image file names that are found to be duplicates of some other file in the directory.
"},{"location":"methods/hashing/#example-usage_3","title":"Example usage","text":"
from imagededup.methods import <hash-method>\nmyencoder = <hash-method>()\nduplicates = myencoder.find_duplicates_to_remove(image_dir='path/to/images/directory',\nmax_distance_threshold=15)\n\nOR\n\nfrom imagededup.methods import <hash-method>\nmyencoder = <hash-method>()\nduplicates = myencoder.find_duplicates_to_remove(encoding_map=<mapping filename to hashes>,\nmax_distance_threshold=15, outfile='results.json')\n
"},{"location":"methods/hashing/#class-phash","title":"class PHash","text":"

Inherits from Hashing base class and implements perceptual hashing (Implementation reference: http://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html).

Offers all the functionality mentioned in hashing class.

"},{"location":"methods/hashing/#example-usage_4","title":"Example usage","text":"
# Perceptual hash for images\nfrom imagededup.methods import PHash\nphasher = PHash()\nperceptual_hash = phasher.encode_image(image_file = 'path/to/image.jpg')\nOR\nperceptual_hash = phasher.encode_image(image_array = <numpy image array>)\nOR\nperceptual_hashes = phasher.encode_images(image_dir = 'path/to/directory')  # for a directory of images\n\n# Finding duplicates:\nfrom imagededup.methods import PHash\nphasher = PHash()\nduplicates = phasher.find_duplicates(image_dir='path/to/directory', max_distance_threshold=15, scores=True)\nOR\nduplicates = phasher.find_duplicates(encoding_map=encoding_map, max_distance_threshold=15, scores=True)\n\n# Finding duplicates to return a single list of duplicates in the image collection\nfrom imagededup.methods import PHash\nphasher = PHash()\nfiles_to_remove = phasher.find_duplicates_to_remove(image_dir='path/to/images/directory',\nmax_distance_threshold=15)\nOR\nfiles_to_remove = phasher.find_duplicates_to_remove(encoding_map=encoding_map, max_distance_threshold=15)\n
"},{"location":"methods/hashing/#__init___1","title":"__init__","text":"
def __init__(verbose)\n

Initialize perceptual hashing class.

"},{"location":"methods/hashing/#args_6","title":"Args","text":"
  • verbose: Display progress bar if True else disable it. Default value is True.
"},{"location":"methods/hashing/#class-ahash","title":"class AHash","text":"

Inherits from Hashing base class and implements average hashing. (Implementation reference: http://www.hackerfactor.com/blog/index.php?/archives/529-Kind-of-Like-That.html)

Offers all the functionality mentioned in hashing class.

"},{"location":"methods/hashing/#example-usage_5","title":"Example usage","text":"
# Average hash for images\nfrom imagededup.methods import AHash\nahasher = AHash()\naverage_hash = ahasher.encode_image(image_file = 'path/to/image.jpg')\nOR\naverage_hash = ahasher.encode_image(image_array = <numpy image array>)\nOR\naverage_hashes = ahasher.encode_images(image_dir = 'path/to/directory')  # for a directory of images\n\n# Finding duplicates:\nfrom imagededup.methods import AHash\nahasher = AHash()\nduplicates = ahasher.find_duplicates(image_dir='path/to/directory', max_distance_threshold=15, scores=True)\nOR\nduplicates = ahasher.find_duplicates(encoding_map=encoding_map, max_distance_threshold=15, scores=True)\n\n# Finding duplicates to return a single list of duplicates in the image collection\nfrom imagededup.methods import AHash\nahasher = AHash()\nfiles_to_remove = ahasher.find_duplicates_to_remove(image_dir='path/to/images/directory',\nmax_distance_threshold=15)\nOR\nfiles_to_remove = ahasher.find_duplicates_to_remove(encoding_map=encoding_map, max_distance_threshold=15)\n
"},{"location":"methods/hashing/#__init___2","title":"__init__","text":"
def __init__(verbose)\n

Initialize average hashing class.

"},{"location":"methods/hashing/#args_7","title":"Args","text":"
  • verbose: Display progress bar if True else disable it. Default value is True.
"},{"location":"methods/hashing/#class-dhash","title":"class DHash","text":"

Inherits from Hashing base class and implements difference hashing. (Implementation reference: http://www.hackerfactor.com/blog/index.php?/archives/529-Kind-of-Like-That.html)

Offers all the functionality mentioned in hashing class.

"},{"location":"methods/hashing/#example-usage_6","title":"Example usage","text":"
# Difference hash for images\nfrom imagededup.methods import DHash\ndhasher = DHash()\ndifference_hash = dhasher.encode_image(image_file = 'path/to/image.jpg')\nOR\ndifference_hash = dhasher.encode_image(image_array = <numpy image array>)\nOR\ndifference_hashes = dhasher.encode_images(image_dir = 'path/to/directory')  # for a directory of images\n\n# Finding duplicates:\nfrom imagededup.methods import DHash\ndhasher = DHash()\nduplicates = dhasher.find_duplicates(image_dir='path/to/directory', max_distance_threshold=15, scores=True)\nOR\nduplicates = dhasher.find_duplicates(encoding_map=encoding_map, max_distance_threshold=15, scores=True)\n\n# Finding duplicates to return a single list of duplicates in the image collection\nfrom imagededup.methods import DHash\ndhasher = DHash()\nfiles_to_remove = dhasher.find_duplicates_to_remove(image_dir='path/to/images/directory',\nmax_distance_threshold=15)\nOR\nfiles_to_remove = dhasher.find_duplicates_to_remove(encoding_map=encoding_map, max_distance_threshold=15)\n
"},{"location":"methods/hashing/#__init___3","title":"__init__","text":"
def __init__(verbose)\n

Initialize difference hashing class.

"},{"location":"methods/hashing/#args_8","title":"Args","text":"
  • verbose: Display progress bar if True else disable it. Default value is True.
"},{"location":"methods/hashing/#class-whash","title":"class WHash","text":"

Inherits from Hashing base class and implements wavelet hashing. (Implementation reference: https://fullstackml.com/wavelet-image-hash-in-python-3504fdd282b5)

Offers all the functionality mentioned in hashing class.

"},{"location":"methods/hashing/#example-usage_7","title":"Example usage","text":"
# Wavelet hash for images\nfrom imagededup.methods import WHash\nwhasher = WHash()\nwavelet_hash = whasher.encode_image(image_file = 'path/to/image.jpg')\nOR\nwavelet_hash = whasher.encode_image(image_array = <numpy image array>)\nOR\nwavelet_hashes = whasher.encode_images(image_dir = 'path/to/directory')  # for a directory of images\n\n# Finding duplicates:\nfrom imagededup.methods import WHash\nwhasher = WHash()\nduplicates = whasher.find_duplicates(image_dir='path/to/directory', max_distance_threshold=15, scores=True)\nOR\nduplicates = whasher.find_duplicates(encoding_map=encoding_map, max_distance_threshold=15, scores=True)\n\n# Finding duplicates to return a single list of duplicates in the image collection\nfrom imagededup.methods import WHash\nwhasher = WHash()\nfiles_to_remove = whasher.find_duplicates_to_remove(image_dir='path/to/images/directory',\nmax_distance_threshold=15)\nOR\nfiles_to_remove = whasher.find_duplicates_to_remove(encoding_map=encoding_map, max_distance_threshold=15)\n
"},{"location":"methods/hashing/#__init___4","title":"__init__","text":"
def __init__(verbose)\n

Initialize wavelet hashing class.

"},{"location":"methods/hashing/#args_9","title":"Args","text":"
  • verbose: Display progress bar if True else disable it. Default value is True.
"},{"location":"user_guide/benchmarks/","title":"Benchmarks","text":"

To give an idea of the speed and accuracy of the implemented algorithms, a benchmark has been provided on the UKBench dataset (zip file titled 'UKBench image collection', having size ~1.5 GB) and some variations derived from it.

"},{"location":"user_guide/benchmarks/#datasets","title":"Datasets","text":"

Three datasets have been used:

  1. Near duplicate dataset (UKBench dataset): This dataset has near duplicates that are arranged in groups of 4. There are 2550 such groups, amounting to a total of 10200 RGB images. Each image is 640 x 480 with a jpg extension. The image below depicts 3 example groups from the UKBench dataset; each row represents a group with the corresponding 4 images from the group.

  2. Transformed dataset derived from UKBench dataset: One image each was taken from different groups of the UKBench dataset and the following 5 transformations were applied to the original image:

    • Random crop preserving the original aspect ratio (new size - 560 x 420)
    • Horizontal flip
    • Vertical flip
    • 25 degree rotation
    • Resizing with change in aspect ratio (new aspect ratio - 1:1)

    Thus, each group has a total of 6 images (original + transformed). A total of 1800 such groups were created totalling 10800 images in the dataset.

  3. Exact duplicate dataset: An image from each of the 2550 image groups of the UKBench dataset was taken and an exact duplicate was created. The number of images totalled 5100.

"},{"location":"user_guide/benchmarks/#environment","title":"Environment","text":"

The benchmarks were performed on an AWS ec2 r5.xlarge instance having 4 vCPUs and 32 GB memory. The instance does not have a GPU, so all the runs are done on CPUs.

"},{"location":"user_guide/benchmarks/#metrics","title":"Metrics","text":"

The metrics used here are classification metrics as explained in the documentation.

class-0 refers to non-duplicate image pairs.

class-1 refers to duplicate image pairs.

The reported numbers are rounded to 3 decimal places.

"},{"location":"user_guide/benchmarks/#timings","title":"Timings","text":"

The times are reported in seconds and comprise the time taken to generate encodings and find duplicates. The time taken to perform the evaluation task is NOT reported.

"},{"location":"user_guide/benchmarks/#threshold-selection","title":"Threshold selection","text":"

For each method, 3 different thresholds have been selected.

For hashing methods, the following max_distance_threshold values are used:

  • 0: Indicates that exactly the same hash should be generated for the image pairs to be considered duplicates.
  • 10: Default.
  • 32: Halfway between the maximum and minimum values (0 and 64).

For the cnn method, the following min_similarity_threshold values are used:

  • 1.0: Indicates that exactly the same cnn embeddings should be generated for the image pairs to be considered duplicates.
  • 0.9: Default.
  • 0.5: A threshold that allows large deviation between image pairs.
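
As a sketch, a benchmark-style run at one of these thresholds could look as follows (dataset path hypothetical; the reported time covers encoding generation and duplicate finding, as described under Timings):

import time\nfrom imagededup.methods import PHash\n\nphasher = PHash()\nstart = time.time()\nencodings = phasher.encode_images(image_dir='path/to/ukbench')\nduplicates = phasher.find_duplicates(encoding_map=encodings, max_distance_threshold=10)\nelapsed = time.time() - start  # time for encoding generation + duplicate finding\n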
"},{"location":"user_guide/benchmarks/#results","title":"Results","text":""},{"location":"user_guide/benchmarks/#near-duplicate-dataset","title":"Near Duplicate dataset","text":"Method Threshold Time (s) class-0 precision class-1 precision class-0 recall class-1 recall dhash 0 35.570 0.999 0.0 1.0 0.0 dhash 10 35.810 0.999 0.018 0.999 0.0461 dhash 32 106.670 0.998 0.0 0.326 0.884 phash 0 40.073 0.999 1.0 1.0 0.0 phash 10 39.056 0.999 0.498 0.999 0.016 phash 32 98.835 0.998 0.0 0.343 0.856 ahash 0 36.171 0.999 0.282 0.999 0.002 ahash 10 36.560 0.999 0.012 0.996 0.193 ahash 32 97.170 0.999 0.000 0.448 0.932 whash 0 51.710 0.999 0.112 0.999 0.002 whash 10 51.940 0.999 0.008 0.993 0.199 whash 32 112.560 0.999 0.0 0.416 0.933 cnn 0.5 379.680 0.999 0.0 0.856 0.999 cnn 0.9 377.157 0.999 0.995 0.999 0.127 cnn 1.0 379.570 0.999 0.0 1.0 0.0"},{"location":"user_guide/benchmarks/#observations","title":"Observations","text":"
  • The cnn method with a threshold between 0.5 and 0.9 would work best for finding near duplicates. This is indicated by the extreme values that class-1 precision and recall take at the two thresholds.
  • Hashing methods do not perform well for finding near duplicates.
"},{"location":"user_guide/benchmarks/#transformed-dataset","title":"Transformed dataset","text":"Method Threshold Time (s) class-0 precision class-1 precision class-0 recall class-1 recall dhash 0 25.360 0.999 1.0 1.0 0.040 dhash 10 25.309 0.999 0.138 0.999 0.117 dhash 32 108.960 0.990 0.0 0.336 0.872 phash 0 28.069 0.999 1.0 1.0 0.050 phash 10 28.075 0.999 0.341 0.999 0.079 phash 32 107.079 0.990 0.003 0.328 0.847 ahash 0 25.270 0.999 0.961 0.999 0.058 ahash 10 25.389 0.999 0.035 0.997 0.216 ahash 32 93.084 0.990 0.0 0.441 0.849 whash 0 40.390 0.999 0.917 0.999 0.061 whash 10 41.260 0.999 0.023 0.996 0.203 whash 32 109.630 0.990 0.0 0.410 0.853 cnn 0.5 397.380 0.999 0.003 0.852 0.999 cnn 0.9 392.090 0.999 0.999 0.990 0.384 cnn 1.0 396.250 0.990 0.0 1.0 0.0"},{"location":"user_guide/benchmarks/#observations_1","title":"Observations","text":"
  • The cnn method with threshold 0.9 seems to work best for finding transformed duplicates. A slightly lower min_similarity_threshold value could lead to a higher class-1 recall.
  • Hashing methods do not perform well for finding transformed duplicates. Resized images are found easily, but all other transformations lead to poor performance for hashing methods.
"},{"location":"user_guide/benchmarks/#exact-duplicates-dataset","title":"Exact duplicates dataset","text":"Method Threshold Time (s) class-0 precision class-1 precision class-0 recall class-1 recall dhash 0 18.380 1.0 1.0 1.0 1.0 dhash 10 18.410 1.0 0.223 0.999 1.0 dhash 32 34.602 1.0 0.0 0.327 1.0 phash 0 19.780 1.0 1.0 1.0 1.0 phash 10 20.012 1.0 0.980 0.999 1.0 phash 32 34.054 1.0 0.0 0.344 1.0 ahash 0 18.180 1.0 0.998 0.999 1.0 ahash 10 18.228 1.0 0.044 0.995 1.0 ahash 32 31.961 1.0 0.0 0.448 1.0 whash 0 26.097 1.0 0.980 0.999 1.0 whash 10 26.056 1.0 0.029 0.993 1.0 whash 32 39.408 1.0 0.0 0.417 1.0 cnn 0.5 192.050 1.0 0.001 0.860 1.0 cnn 0.9 191.024 1.0 1.0 1.0 1.0 cnn 1.0 194.270 0.999 1.0 1.0 0.580*

* The value is low as opposed to the expected 1.0 because of the cosine_similarity function from scikit-learn (used within the package), which sometimes calculates the similarity to be slightly less than 1.0 even when the vectors are the same.

"},{"location":"user_guide/benchmarks/#observations_2","title":"Observations","text":"
  • Difference hashing is the fastest (max_distance_threshold 0).
  • When using hashing methods for exact duplicates, keep max_distance_threshold to a low value. The value of 0 is good, but a slightly higher value should also work fine.
  • When using cnn method, keep min_similarity_threshold to a high value. The default value of 0.9 seems to work well. A slightly higher value can also be used.
"},{"location":"user_guide/benchmarks/#summary","title":"Summary","text":"
  • Near duplicate dataset: use cnn with an appropriate min_similarity_threshold.
  • Transformed dataset: use cnn with min_similarity_threshold of around 0.9 (default).
  • Exact duplicates dataset: use Difference hashing with 0 max_distance_threshold.
  • A higher max_distance_threshold (i.e., for hashing) leads to a higher execution time. The cnn method doesn't seem much affected by the min_similarity_threshold (though a lower value adds a few seconds to the execution time, as can be seen in all the runs above).
  • Generally speaking, the cnn method takes longer to run than the hashing methods on all datasets. If a GPU is available, the cnn method should be much faster.
"},{"location":"user_guide/custom_model/","title":"Using custom models for CNN","text":"

To allow users to use custom models for encoding generation, we provide a CustomModel construct which serves as a wrapper for a user-defined feature extractor. The CustomModel consists of the following attributes:

  • name: The name of the custom model. Can be set to any string.
  • model: A PyTorch model object, which is a subclass of torch.nn.Module and implements the forward method. The output of the forward method should be a tensor of shape (batch_size x features). Alternatively, a __call__ method is also accepted.
  • transform: A function that transforms a PIL.Image object into a PyTorch tensor. Should correspond to the preprocessing logic of the supplied model.

CustomModel is provided while initializing the cnn object and can be used in the following 2 scenarios:

  1. Using the models provided with the imagededup package. There are 3 models provided currently:
    • MobileNetV3 (MobileNetV3 Small)- This is the default.
    • ViT (Vision Transformer- B16 IMAGENET1K_SWAG_E2E_V1)
    • EfficientNet (EfficientNet B4- IMAGENET1K_V1)
from imagededup.methods import CNN\n\n# Get CustomModel construct\nfrom imagededup.utils import CustomModel\n\n# Get the prepackaged models from imagededup\nfrom imagededup.utils.models import ViT, MobilenetV3, EfficientNet\n\n\n# Declare a custom config with CustomModel, the prepackaged models come with a name and transform function\ncustom_config = CustomModel(name=EfficientNet.name,\n                            model=EfficientNet(), \n                            transform=EfficientNet.transform)\n\n# Use model_config argument to pass the custom config\ncnn = CNN(model_config=custom_config)\n\n# Use the model as usual\n...\n

2. Using a user-defined custom model.

from imagededup.methods import CNN\n\n# Get CustomModel construct\nfrom imagededup.utils import CustomModel\n\n# Import necessary pytorch constructs for initializing a custom feature extractor\nimport torch\nfrom torchvision.transforms import transforms\n\n# Declare custom feature extractor class\nclass MyModel(torch.nn.Module):\n    transform = transforms.Compose(\n        [\n            transforms.ToTensor()\n        ]\n    )\n    name = 'my_custom_model'\n\n    def __init__(self):\n        super().__init__()\n        # Define the layers of the model here\n\n    def forward(self, x):\n        # Do something with x\n        return x\n\ncustom_config = CustomModel(name=MyModel.name,\n                            model=MyModel(),\n                            transform=MyModel.transform)\n\ncnn = CNN(model_config=custom_config)\n\n# Use the model as usual\n...\n

It is not necessary to bundle name and transform functions with the model class. They can be passed separately as well.
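
For instance (a sketch reusing the MyModel class from the previous example, with name and transform supplied directly instead of being read off the class):

from torchvision.transforms import transforms\nfrom imagededup.utils import CustomModel\nfrom imagededup.methods import CNN\n\n# MyModel as defined in the previous example\ncustom_config = CustomModel(name='my_custom_model',\n                            model=MyModel(),\n                            transform=transforms.Compose([transforms.ToTensor()]))\n\ncnn = CNN(model_config=custom_config)\n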

Examples for both scenarios can be found in the examples section.

"},{"location":"user_guide/encoding_generation/","title":"Encoding generation","text":"

It might be desirable to only generate the hashes/cnn encodings for a given image or all images in a directory instead of deduplicating directly with the find_duplicates method. Encodings can be generated for a directory of images or for a single image:

  • Encoding generation for all images in a directory
  • Encoding generation for a single image
"},{"location":"user_guide/encoding_generation/#encoding-generation-for-all-images-in-a-directory","title":"Encoding generation for all images in a directory","text":"

To generate encodings for all images in an image directory, the encode_images function can be used. The general api for using encode_images is:

from imagededup.methods import <method-name>\nmethod_object = <method-name>()\nencodings = method_object.encode_images(image_dir='path/to/image/directory')\n

where the returned variable encodings is a dictionary mapping image file names to their corresponding encodings:

{\n  'image1.jpg': <encoding-image-1>,\n  'image2.jpg': <encoding-image-2>,\n   ..\n}\n

For hashing algorithms, the encodings are 64 bit hashes represented as 16 character hexadecimal strings.

For cnn, the encodings are numpy arrays with shape (576,).

The 'method-name' corresponds to one of the deduplication methods available and can be set to:

  • PHash
  • AHash
  • DHash
  • WHash
  • CNN
"},{"location":"user_guide/encoding_generation/#options","title":"Options","text":"
  • image_dir: Path to the image directory for which encodings are to be generated.
  • recursive: find images recursively in a nested directory structure, set to False by default.
"},{"location":"user_guide/encoding_generation/#considerations","title":"Considerations","text":"
  • If an image in the image directory can't be loaded, no encodings are generated for the image. Hence, there is no entry for the image in the returned encodings dictionary.
  • Supported image formats: 'JPEG', 'PNG', 'BMP', 'MPO', 'PPM', 'TIFF', 'GIF', 'SVG', 'PGM', 'PBM', 'WEBP'.
"},{"location":"user_guide/encoding_generation/#examples","title":"Examples","text":"

Generating encodings using Difference hash:

from imagededup.methods import DHash\ndhasher = DHash()\nencodings = dhasher.encode_images(image_dir='path/to/image/directory')\n
"},{"location":"user_guide/encoding_generation/#encoding-generation-for-a-single-image","title":"Encoding generation for a single image","text":"

To generate encodings for a single image, the encode_image function can be used. The general api for using encode_image is:

from imagededup.methods import <method-name>\nmethod_object = <method-name>()\nencoding = method_object.encode_image(image_file='path/to/image/file')\n

where the returned variable encoding is either a hexadecimal string if a hashing method is used or a (576,) numpy array if cnn is used.

"},{"location":"user_guide/encoding_generation/#options_1","title":"Options","text":"
  • image_file: Optional, path to the image file for which encodings are to be generated.
  • image_array: Optional, used instead of image_file attribute. A numpy array representing the image.
"},{"location":"user_guide/encoding_generation/#considerations_1","title":"Considerations","text":"
  • If the image can't be loaded, no encodings are generated for the image and None is returned.
  • Supported image formats: 'JPEG', 'PNG', 'BMP', 'MPO', 'PPM', 'TIFF', 'GIF', 'SVG', 'PGM', 'PBM', 'WEBP'.
"},{"location":"user_guide/encoding_generation/#examples_1","title":"Examples","text":"

Generating encodings using Difference hash:

from imagededup.methods import DHash\ndhasher = DHash()\nencoding = dhasher.encode_image(image_file='path/to/image/file')\n
"},{"location":"user_guide/evaluating_performance/","title":"Evaluation of deduplication quality","text":"

To determine the quality of the deduplication algorithm and the corresponding threshold, an evaluation framework is provided.

Given a ground truth mapping consisting of file names and a list of duplicates for each file along with a retrieved mapping from the deduplication algorithm for the same files, the following metrics can be obtained using the framework:

  • Mean Average Precision (MAP)
  • Mean Normalized Discounted Cumulative Gain (NDCG)
  • Jaccard Index
  • Per class Precision (class 0 = non-duplicate image pairs, class 1 = duplicate image pairs)
  • Per class Recall (class 0 = non-duplicate image pairs, class 1 = duplicate image pairs)
  • Per class f1-score (class 0 = non-duplicate image pairs, class 1 = duplicate image pairs)

The api for obtaining these metrics is as below:

from imagededup.evaluation import evaluate\nmetrics = evaluate(ground_truth_map, retrieved_map, metric='<metric-name>')\n

where the returned variable metrics is a dictionary containing the following content:

{\n  'map': <map>,\n  'ndcg': <mean ndcg>,\n  'jaccard': <mean jaccard index>,\n  'precision': <numpy array having per class precision>,\n  'recall': <numpy array having per class recall>,\n  'f1-score': <numpy array having per class f1-score>,\n  'support': <numpy array having per class support>\n}\n
"},{"location":"user_guide/evaluating_performance/#options","title":"Options","text":"
  • ground_truth_map: A dictionary representing ground truth with filenames as key and a list of duplicate filenames as value.
  • retrieved_map: A dictionary representing retrieved duplicates with filenames as key and a list of retrieved duplicate filenames as value.
  • metric: Can take one of the following values:
    • 'map'
    • 'ndcg'
    • 'jaccard'
    • 'classification': Returns per class precision, recall, f1-score, support
    • 'all' (default, returns all the above metrics)
"},{"location":"user_guide/evaluating_performance/#considerations","title":"Considerations","text":"
  • Presently, the ground truth map should be prepared manually by the user. Symmetric relations between duplicates must be represented in the ground truth map. If an image i is a duplicate of image j, then j must also be represented as a duplicate of i. Absence of symmetric relations will lead to an exception.

  • Both the ground_truth_map and retrieved_map must have the same keys.

  • There is a difference between the way information retrieval metrics (map, ndcg, jaccard index) and classification metrics (precision, recall, f1-score) treat the symmetric relationships in duplicates. Consider the following ground_truth_map and retrieved_map:

ground_truth_map:

{\n  '1.jpg': ['2.jpg', '4.jpg'],\n  '2.jpg': ['1.jpg'],\n  '3.jpg': [],\n  '4.jpg': ['1.jpg']\n}\n

retrieved_map:

{\n  '1.jpg': ['2.jpg'],\n  '2.jpg': ['1.jpg'],\n  '3.jpg': [],\n  '4.jpg': []\n}\n

From the above, it can be seen that images '1.jpg' and '4.jpg' are not found to be duplicates of each other by the deduplication algorithm.

For calculating information retrieval metrics, each key in the maps is considered as an independent 'query'. In the ground truth, '4.jpg' is a duplicate of the key '1.jpg'. When it is not retrieved, it is considered a miss for query '1.jpg'. Similarly, '1.jpg' is a duplicate of the key '4.jpg' in the ground truth. When this is not retrieved, it is considered a miss for query '4.jpg'. Thus, the missing relationship is accounted for twice instead of just once.

Classification metrics, on the other hand, consider the relationships only once by forming unique pairs of images and labelling each pair as a 0 (non-duplicate image pair) and 1 (duplicate image pair).

Using the ground_truth_map, the ground truth pairs with the corresponding labels are:

Image Pair         | Label
('1.jpg', '2.jpg') | 1
('1.jpg', '3.jpg') | 0
('1.jpg', '4.jpg') | 1
('2.jpg', '3.jpg') | 0
('2.jpg', '4.jpg') | 0
('3.jpg', '4.jpg') | 0

Similarly, using retrieved_map, the retrieved pairs are generated:

Image Pair         | Label
('1.jpg', '2.jpg') | 1
('1.jpg', '3.jpg') | 0
('1.jpg', '4.jpg') | 0
('2.jpg', '3.jpg') | 0
('2.jpg', '4.jpg') | 0
('3.jpg', '4.jpg') | 0

These two sets of pairs are then used to calculate metrics such as precision/recall/f1-score. It can be seen that the missing relationship between the pair ('1.jpg', '4.jpg') is accounted for only once.
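
The pair construction can be sketched as below (a toy illustration of the labelling logic, not the package's internal implementation; it assumes the maps are symmetric, as required above):

from itertools import combinations\n\ndef to_pair_labels(dup_map):\n    # label every unique image pair: 1 = duplicate, 0 = non-duplicate\n    files = sorted(dup_map)\n    return {(a, b): int(b in dup_map[a]) for a, b in combinations(files, 2)}\n\nground_truth_pairs = to_pair_labels({'1.jpg': ['2.jpg', '4.jpg'], '2.jpg': ['1.jpg'], '3.jpg': [], '4.jpg': ['1.jpg']})\nretrieved_pairs = to_pair_labels({'1.jpg': ['2.jpg'], '2.jpg': ['1.jpg'], '3.jpg': [], '4.jpg': []})\n# the two dictionaries differ in exactly one pair: ('1.jpg', '4.jpg')\n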

"},{"location":"user_guide/finding_duplicates/","title":"Finding duplicates","text":"

There are two methods available to find duplicates:

  • find_duplicates()
  • find_duplicates_to_remove()
"},{"location":"user_guide/finding_duplicates/#find_duplicates","title":"find_duplicates()","text":"

To find duplicates in an image directory, the general api is:

from imagededup.methods import <method-name>\nmethod_object = <method-name>()\nduplicates = method_object.find_duplicates(image_dir='path/to/image/directory',\n                                           <threshold-parameter-value>)\n

Duplicates can also be found if encodings of the images are available:

from imagededup.methods import <method-name>\nmethod_object = <method-name>()\nduplicates = method_object.find_duplicates(encoding_map,\n                                           <threshold-parameter-value>)\n

where the returned variable duplicates is a dictionary with the following content:

{\n  'image1.jpg': ['image1_duplicate1.jpg',\n                'image1_duplicate2.jpg'],\n  'image2.jpg': [..],\n  ..\n}\n

Each key in the duplicates dictionary corresponds to a file in the image directory passed to the image_dir parameter of the find_duplicates function. The value is a list of all file names in the image directory that were found to be duplicates for the key file. The 'method-name' corresponds to one of the deduplication methods available and can be set to:

  • PHash
  • AHash
  • DHash
  • WHash
  • CNN
"},{"location":"user_guide/finding_duplicates/#options","title":"Options","text":"
  • image_dir: Optional, directory where all image files are present.

  • encoding_map: Optional, used instead of image_dir attribute. Set it equal to the dictionary of file names and corresponding encodings (hashes/cnn encodings). The mentioned dictionary can be generated using the corresponding encode_images method.

  • scores: Setting it to True returns the scores representing the hamming distance (for hashing) or cosine similarity (for cnn) of each of the duplicate file names from the key file. In this case, the returned 'duplicates' dictionary has the following content:
{\n  'image1.jpg': [('image1_duplicate1.jpg', score),\n                 ('image1_duplicate2.jpg', score)],\n  'image2.jpg': [..],\n  ..\n}\n

Each key in the duplicates dictionary corresponds to a file in the image directory passed to the image_dir parameter of the find_duplicates function. The value is a list of tuples representing the file names and corresponding scores in the image directory that were found to be duplicates of the key file.

  • outfile: Name of file to which the returned duplicates dictionary is to be written, must be a json. None by default.

  • threshold parameter:

    • min_similarity_threshold for the cnn method, indicating the minimum cosine similarity that should exist between the key image and a candidate image so that the candidate image can be considered as a duplicate of the key image. Should be a float between -1.0 and 1.0. Default value is 0.9.

    • max_distance_threshold for hashing methods, indicating the maximum hamming distance that can exist between the key image and a candidate image so that the candidate image can be considered as a duplicate of the key image. Should be an int between 0 and 64. Default value is 10. (For intuition, see the hamming distance sketch after this list.)

  • recursive: finding images recursively in a nested directory structure, set to False by default.
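
For intuition on max_distance_threshold, the hamming distance between two hashes counts the differing bits of the underlying 64-bit hashes, as sketched below with made-up hash values:

def hamming_distance(hash1, hash2):\n    # count differing bits between two hexadecimal hash strings\n    return bin(int(hash1, 16) ^ int(hash2, 16)).count('1')\n\nhamming_distance('9f8fa3b1c3e4f2d0', '9f8fa3b1c3e4f2d1')  # 1, well below the default threshold of 10\n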

"},{"location":"user_guide/finding_duplicates/#considerations","title":"Considerations","text":"
  • The returned duplicates dictionary contains symmetric relationships i.e., if an image i is a duplicate of image j, then image j must also be a duplicate of image i. Let's say that the image directory only consists of images i and j, then the duplicates dictionary would have the following content:
{\n  'i': ['j'],\n  'j': ['i']\n}\n
  • If an image in the image directory can't be loaded, no encodings are generated for the image. Hence, the image is disregarded for deduplication and has no entry in the returned duplicates dictionary.
"},{"location":"user_guide/finding_duplicates/#examples","title":"Examples","text":"

To deduplicate an image directory using perceptual hashing, with a maximum allowed hamming distance of 12, scores returned along with duplicate filenames and the returned dictionary saved to file 'my_duplicates.json', use the following:

from imagededup.methods import PHash\nphasher = PHash()\nduplicates = phasher.find_duplicates(image_dir='path/to/image/directory',\n                                     max_distance_threshold=12, \n                                     scores=True, \n                                     outfile='my_duplicates.json')\n
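
Since scores=True was passed, each value in the returned duplicates dictionary is a list of (filename, hamming distance) tuples. For example, to keep only the closest match per image (a sketch using the duplicates variable from above):

closest = {k: min(v, key=lambda t: t[1]) for k, v in duplicates.items() if v}  # lowest hamming distance = closest match\n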

To deduplicate an image directory using cnn, with a minimum cosine similarity of 0.85, no scores returned and the returned dictionary saved to file 'my_duplicates.json', use the following:

from imagededup.methods import CNN\ncnn_encoder = CNN()\nduplicates = cnn_encoder.find_duplicates(image_dir='path/to/image/directory', \n                                         min_similarity_threshold=0.85, \n                                         scores=False, \n                                         outfile='my_duplicates.json')\n
"},{"location":"user_guide/finding_duplicates/#find_duplicates_to_remove","title":"find_duplicates_to_remove()","text":"

Returns a list of files in the image directory that are considered as duplicates. Does NOT remove the said files.

The api is similar to the find_duplicates function (except for the scores attribute of find_duplicates). This function returns a single list of file names in the directory that are found to be duplicates. The general api for the method is as below:

from imagededup.methods import <method-name>\nmethod_object = <method-name>()\nduplicates = method_object.find_duplicates_to_remove(image_dir='path/to/image/directory', \n                                                     <threshold-parameter-value>)\nOR\n\nduplicates = method_object.find_duplicates_to_remove(encoding_map=encoding_map, \n                                                     <threshold-parameter-value>)\n

In this case, the returned variable duplicates is a list containing the name of image files that are found to be duplicates of some file in the directory:

[\n  'image1_duplicate1.jpg',\n  'image1_duplicate2.jpg'\n  ,..\n]\n

The 'method-name' corresponds to one of the deduplication methods available and can be set to:

  • PHash
  • AHash
  • DHash
  • WHash
  • CNN
"},{"location":"user_guide/finding_duplicates/#options_1","title":"Options","text":"
  • image_dir: Optional, directory where all image files are present.

  • encoding_map: Optional, used instead of image_dir attribute. Set it equal to the dictionary of file names and corresponding encodings (hashes/cnn encodings). The mentioned dictionary can be generated using the corresponding encode_images method.

  • outfile: Name of file to which the returned duplicates dictionary is to be written, must be a json. None by default.

  • threshold parameter:

    • min_similarity_threshold for the cnn method, indicating the minimum cosine similarity that should exist between the key image and a candidate image so that the candidate image can be considered as a duplicate for the key image. Should be a float between -1.0 and 1.0. Default value is 0.9.

    • max_distance_threshold for hashing methods, indicating the maximum hamming distance that can exist between the key image and a candidate image so that the candidate image can be considered as a duplicate for the key image. Should be an int between 0 and 64. Default value is 10.

  • recursive: finding images recursively in a nested directory structure, set to False by default.

"},{"location":"user_guide/finding_duplicates/#considerations_1","title":"Considerations","text":"
  • This method must be used with caution. The symmetric nature of duplicates makes it ambiguous which image of a pair should be marked as the duplicate and which as the original. Consider the following duplicates dictionary:
{\n  '1.jpg': ['2.jpg'],\n  '2.jpg': ['1.jpg', '3.jpg'],\n  '3.jpg': ['2.jpg']\n}\n

In this case, it is possible to remove only 2.jpg, which leaves 1.jpg and 3.jpg as non-duplicates of each other. However, it is also possible to remove both 1.jpg and 3.jpg, leaving only 2.jpg. The find_duplicates_to_remove method can thus return either of these outputs. In the above example, let's say that 1.jpg is retained, while its duplicate, 2.jpg, is marked as a duplicate. Once 2.jpg is marked as a duplicate, its own found duplicates are disregarded. Thus, 1.jpg and 3.jpg would not be considered as duplicates. So, the final return would be:

['2.jpg']\n

This leaves 1.jpg and 3.jpg as non-duplicates in the directory. If the user does not wish to impose this heuristic, it is advised to use the find_duplicates function and apply a custom heuristic to mark a file as a duplicate; a sketch of one such heuristic is shown after this list.

  • If an image in the image directory can't be loaded, no encodings are generated for the image. Hence, the image is disregarded for deduplication and has no entry in the returned duplicates dictionary.
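
A minimal sketch of one such custom heuristic, which greedily keeps the first file of every duplicate group (assuming duplicates was obtained from find_duplicates without scores):

def files_to_remove_keep_first(duplicates):\n    marked = set()\n    for key in sorted(duplicates):\n        if key not in marked:\n            # keep this file and mark its duplicates for removal\n            marked.update(duplicates[key])\n    return sorted(marked)\n\nfiles_to_remove_keep_first({'1.jpg': ['2.jpg'], '2.jpg': ['1.jpg', '3.jpg'], '3.jpg': ['2.jpg']})  # ['2.jpg']\n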
"},{"location":"user_guide/finding_duplicates/#examples_1","title":"Examples","text":"

To deduplicate an image directory using perceptual hashing, with a maximum allowed hamming distance of 12, and the returned list saved to file 'my_duplicates.json', use the following:

from imagededup.methods import PHash\nphasher = PHash()\nduplicates = phasher.find_duplicates_to_remove(image_dir='path/to/image/directory', \n                                               max_distance_threshold=12, \n                                               outfile='my_duplicates.json')\n

To deduplicate an image directory using cnn, with a minimum cosine similarity of 0.85 and the returned list saved to file 'my_duplicates.json', use the following:

from imagededup.methods import CNN\ncnn_encoder = CNN()\nduplicates = cnn_encoder.find_duplicates_to_remove(image_dir='path/to/image/directory', \n                                                   min_similarity_threshold=0.85, \n                                                   outfile='my_duplicates.json')\n
"},{"location":"user_guide/plotting_duplicates/","title":"Plotting duplicates of an image","text":"

Once a duplicate dictionary corresponding to an image directory has been obtained (using find_duplicates), duplicates for an image can be plotted using the plot_duplicates method as below:

from imagededup.utils import plot_duplicates\nplot_duplicates(image_dir, duplicate_map, filename, outfile=None)\n

where filename is the file for which duplicates are to be plotted.

"},{"location":"user_guide/plotting_duplicates/#options","title":"Options","text":"
  • image_dir: Directory where all image files are present.

  • duplicate_map: A dictionary representing retrieved duplicates with filenames as key and a list of retrieved duplicate filenames as value. A duplicate_map with scores can also be passed (obtained from find_duplicates function with scores attribute set to True).

  • filename: Image file name for which duplicates are to be plotted.

  • outfile: Optional, name of the file the plot should be saved to. None by default.

The output looks as below:

"},{"location":"utils/data_generator/","title":"Data generator","text":""},{"location":"utils/data_generator/#img_dataloader","title":"img_dataloader","text":"
def img_dataloader(image_dir, batch_size, basenet_preprocess, recursive, num_workers)\n
"},{"location":"utils/data_generator/#class-imgdataset","title":"class ImgDataset","text":""},{"location":"utils/data_generator/#__init__","title":"__init__","text":"
def __init__(image_dir, basenet_preprocess, recursive)\n
"},{"location":"utils/data_generator/#__len__","title":"__len__","text":"
def __len__()\n

Number of images.

"},{"location":"utils/data_generator/#__getitem__","title":"__getitem__","text":"
def __getitem__(item)\n
"},{"location":"utils/general_utils/","title":"General utils","text":""},{"location":"utils/general_utils/#get_files_to_remove","title":"get_files_to_remove","text":"
def get_files_to_remove(duplicates)\n

Get a list of files to remove.

"},{"location":"utils/general_utils/#args","title":"Args","text":"
  • duplicates: A dictionary with file name as key and a list of duplicate file names as value.
"},{"location":"utils/general_utils/#returns","title":"Returns","text":""},{"location":"utils/general_utils/#save_json","title":"save_json","text":"
def save_json(results, filename, float_scores)\n

Save results with a filename.

"},{"location":"utils/general_utils/#args_1","title":"Args","text":"
  • results: Dictionary of results to be saved.

  • filename: Name of the file to be saved.

  • float_scores: boolean to indicate if scores are floats.

"},{"location":"utils/general_utils/#parallelise","title":"parallelise","text":"
def parallelise(function, data, verbose, num_workers)\n
"},{"location":"utils/general_utils/#generate_files","title":"generate_files","text":"
def generate_files(image_dir, recursive)\n
"},{"location":"utils/general_utils/#generate_relative_names","title":"generate_relative_names","text":"
def generate_relative_names(image_dir, files)\n
"},{"location":"utils/image_utils/","title":"Image utils","text":""},{"location":"utils/image_utils/#check_image_array_hash","title":"check_image_array_hash","text":"
def check_image_array_hash(image_arr)\n

Checks the sanity of the input image numpy array for hashing functions.

"},{"location":"utils/image_utils/#args","title":"Args","text":"
  • image_arr: Image array.
"},{"location":"utils/image_utils/#expand_image_array_cnn","title":"expand_image_array_cnn","text":"
def expand_image_array_cnn(image_arr)\n

Checks the sanity of the input image numpy array for cnn and converts the grayscale numpy array to rgb by repeating the array thrice along the 3rd dimension if a 2-dimensional image array is provided.
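
The grayscale-to-rgb conversion described above can be illustrated with a standalone numpy sketch (not the package's exact code):

import numpy as np\n\ngray = np.random.rand(224, 224)  # 2-dimensional grayscale array\nrgb = np.repeat(gray[..., np.newaxis], 3, axis=2)\nprint(rgb.shape)  # (224, 224, 3)\n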

"},{"location":"utils/image_utils/#args_1","title":"Args","text":"
  • image_arr: Image array.
"},{"location":"utils/image_utils/#returns","title":"Returns","text":""},{"location":"utils/image_utils/#preprocess_image","title":"preprocess_image","text":"
def preprocess_image(image, target_size, grayscale)\n

Takes as input an image as a numpy array or a Pillow image. Returns an array version of the image, optionally resized and converted to grayscale.

"},{"location":"utils/image_utils/#args_2","title":"Args","text":"
  • image: numpy array or a pillow image.

  • target_size: Size to resize the input image to.

  • grayscale: A boolean indicating whether to grayscale the image.

"},{"location":"utils/image_utils/#returns_1","title":"Returns","text":""},{"location":"utils/image_utils/#load_image","title":"load_image","text":"
def load_image(image_file, target_size, grayscale, img_formats)\n

Loads an image given its path. Returns an array version of the image, optionally resized and converted to grayscale. Only allows images of the types described by the img_formats argument.

"},{"location":"utils/image_utils/#args_3","title":"Args","text":"
  • image_file: Path to the image file.

  • target_size: Size to resize the input image to.

  • grayscale: A boolean indicating whether to grayscale the image.

  • img_formats: List of allowed image formats that can be loaded.

"},{"location":"utils/logger/","title":"Logger","text":""},{"location":"utils/logger/#return_logger","title":"return_logger","text":"
def return_logger(name)\n
"},{"location":"utils/models/","title":"Custom Models","text":""},{"location":"utils/models/#class-custommodel","title":"class CustomModel","text":"

A named tuple that can be used to initialize a custom PyTorch model.

"},{"location":"utils/models/#args","title":"Args","text":"
  • name: The name of the custom model. Default is 'default_model'.

  • model: The PyTorch model object, which should be a subclass of torch.nn.Module, implement the forward method and output a tensor of shape (batch_size x features). Alternatively, a call method is also accepted. Default is None.

  • transform: A function that transforms a PIL.Image object into a PyTorch tensor that will be applied to each image before being fed to the model. Should correspond to the preprocessing logic of the supplied model. Default is None.
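
For illustration, a sketch of wrapping a torchvision backbone in a CustomModel (the import path for CustomModel and the choice of ResNet18 are assumptions for this example):

import torch\nimport torchvision.models as models\nfrom torchvision import transforms\nfrom imagededup.methods import CNN\nfrom imagededup.utils.models import CustomModel  # import path assumed\n\n# ResNet18 cut before its classification head; outputs a (batch_size x 512) tensor\nbackbone = torch.nn.Sequential(*list(models.resnet18(pretrained=True).children())[:-1], torch.nn.Flatten())\npreprocess = transforms.Compose([transforms.Resize((256, 256)), transforms.ToTensor(), transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])\nmy_model = CustomModel(name='resnet18_features', model=backbone, transform=preprocess)\ncnn = CNN(model_config=my_model)\n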

"},{"location":"utils/models/#class-mobilenetv3","title":"class MobilenetV3","text":""},{"location":"utils/models/#__init__","title":"__init__","text":"
def __init__()\n

Initializes a MobileNetV3 model, cuts it at the global average pooling layer and returns the output features.

"},{"location":"utils/models/#forward","title":"forward","text":"
def forward(x)\n
"},{"location":"utils/models/#class-vit","title":"class ViT","text":""},{"location":"utils/models/#__init___1","title":"__init__","text":"
def __init__()\n

Initializes a ViT model, takes the mean of the final encoder layer outputs and returns those as features for a given image.

"},{"location":"utils/models/#forward_1","title":"forward","text":"
def forward(x)\n
"},{"location":"utils/models/#class-efficientnet","title":"class EfficientNet","text":""},{"location":"utils/models/#__init___2","title":"__init__","text":"
def __init__()\n

Initializes an EfficientNet model, cuts it at the global average pooling layer and returns the output features.

"},{"location":"utils/models/#forward_2","title":"forward","text":"
def forward(x)\n
"},{"location":"utils/plotter/","title":"Plot duplicates","text":""},{"location":"utils/plotter/#plot_duplicates","title":"plot_duplicates","text":"
def plot_duplicates(image_dir, duplicate_map, filename, outfile)\n

Given filename for an image, plot duplicates along with the original image using the duplicate map obtained using find_duplicates method.

"},{"location":"utils/plotter/#args","title":"Args","text":"
  • image_dir: image directory where all files in duplicate_map are present.

  • duplicate_map: mapping of filename to found duplicates (could be with or without scores).

  • filename: Name of the file for which duplicates are to be plotted, must be a key in the duplicate_map.

  • outfile: Optional, name of the file to save the plot. Default is None.

"},{"location":"utils/plotter/#example-usage","title":"Example usage","text":"
from imagededup.utils import plot_duplicates\nplot_duplicates(image_dir='path/to/image/directory',\n                duplicate_map=duplicate_map,\n                filename='image.jpg')\n
"}]} \ No newline at end of file +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Image Deduplicator (imagededup)","text":"

imagededup is a python package that simplifies the task of finding exact and near duplicates in an image collection.

This package provides functionality to make use of hashing algorithms that are particularly good at finding exact duplicates as well as convolutional neural networks which are also adept at finding near duplicates. An evaluation framework is also provided to judge the quality of deduplication for a given dataset.

The following details the functionality provided by the package:

  • Finding duplicates in a directory using one of the following algorithms:
  • Convolutional Neural Network (CNN) - Select from several prepackaged models or provide your own custom model.
  • Perceptual hashing (PHash)
  • Difference hashing (DHash)
  • Wavelet hashing (WHash)
  • Average hashing (AHash)
  • Generation of encodings for images using one of the above stated algorithms.
  • Framework to evaluate effectiveness of deduplication given a ground truth mapping.
  • Plotting duplicates found for a given image file.

Detailed documentation for the package can be found at: https://idealo.github.io/imagededup/

imagededup is compatible with Python 3.8+ and runs on Linux, MacOS X and Windows. It is distributed under the Apache 2.0 license.

"},{"location":"#contents","title":"\ud83d\udcd6 Contents","text":"
  • Installation
  • Quick Start
  • Benchmarks
  • Contribute
  • Citation
  • Maintainers
  • License
"},{"location":"#installation","title":"\u2699\ufe0f Installation","text":"

There are two ways to install imagededup:

  • Install imagededup from PyPI (recommended):
pip install imagededup\n
  • Install imagededup from the GitHub source:
git clone https://github.com/idealo/imagededup.git\ncd imagededup\npip install \"cython>=0.29\"\npython setup.py install\n
"},{"location":"#quick-start","title":"\ud83d\ude80 Quick Start","text":"

In order to find duplicates in an image directory using perceptual hashing, the following workflow can be used:

  • Import perceptual hashing method
from imagededup.methods import PHash\nphasher = PHash()\n
  • Generate encodings for all images in an image directory
encodings = phasher.encode_images(image_dir='path/to/image/directory')\n
  • Find duplicates using the generated encodings
duplicates = phasher.find_duplicates(encoding_map=encodings)\n
  • Plot duplicates obtained for a given file (eg: 'ukbench00120.jpg') using the duplicates dictionary
from imagededup.utils import plot_duplicates\nplot_duplicates(image_dir='path/to/image/directory',\n                duplicate_map=duplicates,\n                filename='ukbench00120.jpg')\n

The output looks as below:

The complete code for the workflow is:

from imagededup.methods import PHash\nphasher = PHash()\n\n# Generate encodings for all images in an image directory\nencodings = phasher.encode_images(image_dir='path/to/image/directory')\n\n# Find duplicates using the generated encodings\nduplicates = phasher.find_duplicates(encoding_map=encodings)\n\n# plot duplicates obtained for a given file using the duplicates dictionary\nfrom imagededup.utils import plot_duplicates\nplot_duplicates(image_dir='path/to/image/directory',\n                duplicate_map=duplicates,\n                filename='ukbench00120.jpg')\n

It is also possible to use your own custom models for finding duplicates using the CNN method.

For examples, refer to this part of the repository.

For more detailed usage of the package functionality, refer to: https://idealo.github.io/imagededup/

"},{"location":"#benchmarks","title":"\u23f3 Benchmarks","text":"

Update: Provided benchmarks are only valid up to imagededup v0.2.2. The subsequent releases have significant changes to all methods, so the current benchmarks may not hold.

Detailed benchmarks on speed and classification metrics for different methods have been provided in the documentation. Generally speaking, the following conclusions can be drawn:

  • CNN works best for near duplicates and datasets containing transformations.
  • All deduplication methods fare well on datasets containing exact duplicates, but Difference hashing is the fastest.
"},{"location":"#contribute","title":"\ud83e\udd1d Contribute","text":"

We welcome all kinds of contributions. See the Contribution guide for more details.

"},{"location":"#citation","title":"\ud83d\udcdd Citation","text":"

Please cite Imagededup in your publications if this is useful for your research. Here is an example BibTeX entry:

@misc{idealods2019imagededup,\n  title={Imagededup},\n  author={Tanuj Jain and Christopher Lennan and Zubin John and Dat Tran},\n  year={2019},\n  howpublished={\\url{https://github.com/idealo/imagededup}},\n}\n
"},{"location":"#maintainers","title":"\ud83c\udfd7 Maintainers","text":"
  • Tanuj Jain, github: tanujjain
  • Christopher Lennan, github: clennan
  • Dat Tran, github: datitran
"},{"location":"#copyright","title":"\u00a9 Copyright","text":"

See LICENSE for details.

"},{"location":"CONTRIBUTING/","title":"Contribution Guide","text":"

We welcome any contribution, whether it is:

  • Submitting feedback
  • Fixing bugs
  • Or implementing a new feature.

Please read this guide before making any contributions.

"},{"location":"CONTRIBUTING/#submit-feedback","title":"Submit Feedback","text":"

The feedback should be submitted by creating an issue on GitHub issues. Select the related template (bug report, feature request, or custom) and add the corresponding labels.

"},{"location":"CONTRIBUTING/#fix-bugs","title":"Fix Bugs","text":"

You may look through the GitHub issues for bugs.

"},{"location":"CONTRIBUTING/#implement-features","title":"Implement Features","text":"

You may look through the GitHub issues for feature requests.

"},{"location":"CONTRIBUTING/#pull-requests-pr","title":"Pull Requests (PR)","text":"
  1. Fork the repository and create a new branch from the dev branch.
  2. For bug fixes, add new tests and for new features, please add changes to the documentation.
  3. Do a PR from your new branch to our dev branch of the original Imagededup repo.
"},{"location":"CONTRIBUTING/#documentation","title":"Documentation","text":"
  • Make sure any new function or class you introduce has proper docstrings.
"},{"location":"CONTRIBUTING/#testing","title":"Testing","text":"
  • We use pytest for our testing. Make sure to write tests for any new feature and/or bug fixes.
"},{"location":"CONTRIBUTING/#main-contributor-list","title":"Main Contributor List","text":"

We maintain a list of main contributors to appreciate all the contributions.

"},{"location":"LICENSE/","title":"License","text":"

Copyright 2019 idealo internet GmbH. All rights reserved.

                             Apache License\n                       Version 2.0, January 2004\n                    http://www.apache.org/licenses/\n

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

  1. Definitions.

    \"License\" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.

    \"Licensor\" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.

    \"Legal Entity\" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, \"control\" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.

    \"You\" (or \"Your\") shall mean an individual or Legal Entity exercising permissions granted by this License.

    \"Source\" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.

    \"Object\" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.

    \"Work\" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).

    \"Derivative Works\" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof.

    \"Contribution\" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, \"submitted\" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as \"Not a Contribution.\"

    \"Contributor\" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work.

  2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.

  3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed.

  4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:

    (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and

    (b) You must cause any modified files to carry prominent notices stating that You changed the files; and

    (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and

    (d) If the Work includes a \"NOTICE\" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License.

    You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License.

  5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions.

  6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file.

  7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License.

  8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.

  9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work.

  To apply the Apache License to your work, attach the following\n  boilerplate notice, with the fields enclosed by brackets \"[]\"\n  replaced with your own identifying information. (Don't include\n  the brackets!)  The text should be enclosed in the appropriate\n  comment syntax for the file format. We also recommend that a\n  file or class name and description of purpose be included on the\n  same \"printed page\" as the copyright notice for easier\n  identification within third-party archives.\n

Copyright [yyyy] [name of copyright owner]

Licensed under the Apache License, Version 2.0 (the \"License\"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0\n

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

"},{"location":"evaluation/evaluation/","title":"Evaluation","text":""},{"location":"evaluation/evaluation/#evaluate","title":"evaluate","text":"
def evaluate(ground_truth_map, retrieved_map, metric)\n

Given a ground truth map and a duplicate map retrieved from a deduplication algorithm, get metrics to evaluate the effectiveness of the applied deduplication algorithm.

"},{"location":"evaluation/evaluation/#args","title":"Args","text":"
  • ground_truth_map: A dictionary representing ground truth with filenames as key and a list of duplicate filenames as value.

  • retrieved_map: A dictionary representing retrieved duplicates with filenames as key and a list of retrieved duplicate filenames as value.

  • metric: Name of metric to be evaluated and returned. Accepted values are: 'map', 'ndcg', 'jaccard', 'classification', 'all'(default, returns every metric).

"},{"location":"evaluation/evaluation/#returns","title":"Returns","text":"
  • dictionary: A dictionary with metric name as key and the corresponding calculated metric as the value. 'map', 'ndcg' and 'jaccard' return a single number denoting the corresponding information retrieval metric. 'classification' metrics include 'precision', 'recall' and 'f1-score', which are returned as individual entries in the returned dictionary. The value for each classification metric is a numpy array with the first entry as the score for non-duplicate file pairs (class-0) and the second entry as the score for duplicate file pairs (class-1). Additionally, a support is also returned as another key, with the first entry denoting the number of non-duplicate file pairs and the second entry the number of duplicate file pairs.
"},{"location":"examples/CIFAR10_deduplication/","title":"CIFAR10 deduplication example","text":""},{"location":"examples/CIFAR10_deduplication/#install-imagededup-via-pypi","title":"Install imagededup via PyPI","text":"
!pip install imagededup\n
"},{"location":"examples/CIFAR10_deduplication/#download-cifar10-dataset-and-untar","title":"Download CIFAR10 dataset and untar","text":"
!wget http://pjreddie.com/media/files/cifar.tgz\n!tar xzf cifar.tgz\n
"},{"location":"examples/CIFAR10_deduplication/#create-working-directory-and-move-all-images-into-this-directory","title":"Create working directory and move all images into this directory","text":"
image_dir = 'cifar10_images'\n!mkdir $image_dir\n!cp -r '/content/cifar/train/.' $image_dir\n!cp -r '/content/cifar/test/.' $image_dir\n
"},{"location":"examples/CIFAR10_deduplication/#find-duplicates-in-the-entire-dataset-with-cnn","title":"Find duplicates in the entire dataset with CNN","text":"
from imagededup.methods import CNN\n\ncnn = CNN()\nencodings = cnn.encode_images(image_dir=image_dir)\nduplicates = cnn.find_duplicates(encoding_map=encodings)\n
"},{"location":"examples/CIFAR10_deduplication/#do-some-imports-for-plotting","title":"Do some imports for plotting","text":"
from pathlib import Path\nfrom imagededup.utils import plot_duplicates\nimport matplotlib.pyplot as plt\nplt.rcParams['figure.figsize'] = (15, 10)\n
"},{"location":"examples/CIFAR10_deduplication/#find-and-plot-duplicates-in-the-test-set-with-cnn","title":"Find and plot duplicates in the test set with CNN","text":"
# test images are stored under '/content/cifar/test'\nfilenames_test = set([i.name for i in Path('/content/cifar/test').glob('*.png')])\n\nduplicates_test = {}\nfor k, v in duplicates.items():\n  if k in filenames_test:\n    tmp = [i for i in v if i in filenames_test]\n    duplicates_test[k] = tmp\n\n# sort in descending order of duplicates\nduplicates_test = {k: v for k, v in sorted(duplicates_test.items(), key=lambda x: len(x[1]), reverse=True)}\n\n# plot duplicates found for some file\nplot_duplicates(image_dir=image_dir, duplicate_map=duplicates_test, filename=list(duplicates_test.keys())[0])\n
"},{"location":"examples/CIFAR10_deduplication/#find-and-plot-duplicates-in-the-train-set-with-cnn","title":"Find and plot duplicates in the train set with CNN","text":"
# train images are stored under '/content/cifar/train'\nfilenames_train = set([i.name for i in Path('/content/cifar/train').glob('*.png')])\n\nduplicates_train = {}\nfor k, v in duplicates.items():\n  if k in filenames_train:\n    tmp = [i for i in v if i in filenames_train]\n    duplicates_train[k] = tmp\n\n\n# sort in descending order of duplicates\nduplicates_train = {k: v for k, v in sorted(duplicates_train.items(), key=lambda x: len(x[1]), reverse=True)}\n\n# plot duplicates found for some file\nplot_duplicates(image_dir=image_dir, duplicate_map=duplicates_train, filename=list(duplicates_train.keys())[0])\n
"},{"location":"examples/CIFAR10_deduplication/#examples-from-test-set-with-duplicates-in-train-set","title":"Examples from test set with duplicates in train set","text":"
# keep only filenames that are in the test set and have duplicates in the train set\nduplicates_test_train = {}\nfor k, v in duplicates.items():\n    if k in filenames_test:\n        tmp = [i for i in v if i in filenames_train]\n        duplicates_test_train[k] = tmp\n\n# sort in descending order of duplicates\nduplicates_test_train = {k: v for k, v in sorted(duplicates_test_train.items(), key=lambda x: len(x[1]), reverse=True)}\n\n# plot duplicates found for some file\nplot_duplicates(image_dir=image_dir, duplicate_map=duplicates_test_train, filename=list(duplicates_test_train.keys())[0])\n
"},{"location":"handlers/metrics/classification/","title":"Classification","text":""},{"location":"handlers/metrics/classification/#classification_metrics","title":"classification_metrics","text":"
def classification_metrics(ground_truth, retrieved)\n

Given ground truth dictionary and retrieved dictionary, return per class precision, recall and f1 score. Class 1 is assigned to duplicate file pairs while class 0 is for non-duplicate file pairs.

"},{"location":"handlers/metrics/classification/#args","title":"Args","text":"
  • ground_truth: A dictionary representing ground truth with filenames as key and a list of duplicate filenames as value.

  • retrieved: A dictionary representing retrieved duplicates with filenames as key and a list of retrieved duplicate filenames as value.

"},{"location":"handlers/metrics/classification/#returns","title":"Returns","text":""},{"location":"handlers/metrics/information_retrieval/","title":"Information retrieval","text":""},{"location":"handlers/metrics/information_retrieval/#avg_prec","title":"avg_prec","text":"
def avg_prec(correct_duplicates, retrieved_duplicates)\n

Get average precision (AP) for a single query given correct and retrieved file names.

"},{"location":"handlers/metrics/information_retrieval/#args","title":"Args","text":"
  • correct_duplicates: List of correct duplicates (i.e., ground truth).

  • retrieved_duplicates: List of retrieved duplicates for a single query.

"},{"location":"handlers/metrics/information_retrieval/#returns","title":"Returns","text":""},{"location":"handlers/metrics/information_retrieval/#ndcg","title":"ndcg","text":"
def ndcg(correct_duplicates, retrieved_duplicates)\n

Get Normalized discounted cumulative gain (NDCG) for a single query given correct and retrieved file names.

"},{"location":"handlers/metrics/information_retrieval/#args_1","title":"Args","text":"
  • correct_duplicates: List of correct duplicates (i.e., ground truth).

  • retrieved_duplicates: List of retrieved duplicates for a single query.

"},{"location":"handlers/metrics/information_retrieval/#returns_1","title":"Returns","text":""},{"location":"handlers/metrics/information_retrieval/#jaccard_similarity","title":"jaccard_similarity","text":"
def jaccard_similarity(correct_duplicates, retrieved_duplicates)\n

Get jaccard similarity for a single query given correct and retrieved file names.

"},{"location":"handlers/metrics/information_retrieval/#args_2","title":"Args","text":"
  • correct_duplicates: List of correct duplicates (i.e., ground truth).

  • retrieved_duplicates: List of retrieved duplicates for a single query.

"},{"location":"handlers/metrics/information_retrieval/#returns_2","title":"Returns","text":""},{"location":"handlers/metrics/information_retrieval/#mean_metric","title":"mean_metric","text":"
def mean_metric(ground_truth, retrieved, metric)\n

Get mean of specified metric.

"},{"location":"handlers/metrics/information_retrieval/#args_3","title":"Args","text":"
  • metric: Metric function on which the mean is to be calculated across all queries.
"},{"location":"handlers/metrics/information_retrieval/#returns_3","title":"Returns","text":""},{"location":"handlers/metrics/information_retrieval/#get_all_metrics","title":"get_all_metrics","text":"
def get_all_metrics(ground_truth, retrieved)\n

Get mean of all information retrieval metrics across all queries.

"},{"location":"handlers/metrics/information_retrieval/#args_4","title":"Args","text":"
  • ground_truth: A dictionary representing ground truth with filenames as key and a list of duplicate filenames as value.

  • retrieved: A dictionary representing retrieved duplicates with filenames as key and a list of retrieved duplicate filenames as value.

"},{"location":"handlers/metrics/information_retrieval/#returns_4","title":"Returns","text":""},{"location":"handlers/search/bktree/","title":"Bktree","text":""},{"location":"handlers/search/bktree/#class-bktreenode","title":"class BkTreeNode","text":"

Class to contain the attributes of a single node in the BKTree.

"},{"location":"handlers/search/bktree/#__init__","title":"__init__","text":"
def __init__(node_name, node_value, parent_name)\n
"},{"location":"handlers/search/bktree/#class-bktree","title":"class BKTree","text":"

Class to construct and perform search using a BKTree.

"},{"location":"handlers/search/bktree/#__init___1","title":"__init__","text":"
def __init__(hash_dict, distance_function)\n

Initializes a root for the BKTree and triggers the tree construction using the dictionary mapping file names to corresponding hashes.

"},{"location":"handlers/search/bktree/#args","title":"Args","text":"
  • hash_dict: Dictionary mapping file names to corresponding hash strings {filename: hash}

  • distance_function: A function for calculating distance between the hashes.

"},{"location":"handlers/search/bktree/#construct_tree","title":"construct_tree","text":"
def construct_tree()\n

Construct the BKTree.

"},{"location":"handlers/search/bktree/#search","title":"search","text":"
def search(query, tol)\n

Function to search the bktree given a hash of the query image.

"},{"location":"handlers/search/bktree/#args_1","title":"Args","text":"
  • query: hash string for which BKTree needs to be searched.

  • tol: distance up to which a duplicate is valid.

"},{"location":"handlers/search/bktree/#returns","title":"Returns","text":"
  • List of tuples of the form [(valid_retrieval_filename1, distance), (valid_retrieval_filename2, distance)]
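
For illustration, a sketch of constructing and querying a BKTree (the module path and hash values are assumptions for this example):

from imagededup.handlers.search.bktree import BKTree  # module path assumed\n\ndef hamming(hash1, hash2):\n    # count differing bits between two hexadecimal hash strings\n    return bin(int(hash1, 16) ^ int(hash2, 16)).count('1')\n\ntree = BKTree(hash_dict={'a.jpg': '9f8fa3b1c3e4f2d0', 'b.jpg': '9f8fa3b1c3e4f2d1'}, distance_function=hamming)\ntree.search(query='9f8fa3b1c3e4f2d0', tol=2)  # e.g. [('a.jpg', 0), ('b.jpg', 1)]\n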
"},{"location":"handlers/search/brute_force/","title":"Brute force","text":""},{"location":"handlers/search/brute_force/#class-bruteforce","title":"class BruteForce","text":"

Class to perform search using brute force.

"},{"location":"handlers/search/brute_force/#__init__","title":"__init__","text":"
def __init__(hash_dict, distance_function)\n

Initializes a dictionary mapping file names to corresponding hashes and a distance function to be used for computing the distance between two hash strings.

"},{"location":"handlers/search/brute_force/#args","title":"Args","text":"
  • hash_dict: Dictionary mapping file names to corresponding hash strings {filename: hash}

  • distance_function: A function for calculating distance between the hashes.

"},{"location":"handlers/search/brute_force/#search","title":"search","text":"
def search(query, tol)\n

Function for searching using brute force.

"},{"location":"handlers/search/brute_force/#args_1","title":"Args","text":"
  • query: hash string for which duplicates are to be searched using brute force.

  • tol: distance up to which a duplicate is valid.

"},{"location":"handlers/search/brute_force/#returns","title":"Returns","text":"
  • List of tuples of the form [(valid_retrieval_filename1, distance), (valid_retrieval_filename2, distance)]
"},{"location":"handlers/search/brute_force_cython/","title":"Brute force cython","text":""},{"location":"handlers/search/brute_force_cython/#class-bruteforcecython","title":"class BruteForceCython","text":"

Class to perform search using brute force.

"},{"location":"handlers/search/brute_force_cython/#__init__","title":"__init__","text":"
def __init__(hash_dict, distance_function)\n

Initializes a dictionary mapping file names to corresponding hashes and a distance function to be used for computing the distance between two hash strings.

"},{"location":"handlers/search/brute_force_cython/#args","title":"Args","text":"
  • hash_dict: Dictionary mapping file names to corresponding hash strings {filename: hash}

  • distance_function: A function for calculating distance between the hashes.

"},{"location":"handlers/search/brute_force_cython/#search","title":"search","text":"
def search(query, tol)\n

Function for searching using brute force.

"},{"location":"handlers/search/brute_force_cython/#args_1","title":"Args","text":"
  • query: hash string for which duplicates are to be searched using brute force.

  • tol: distance up to which a duplicate is valid.

"},{"location":"handlers/search/brute_force_cython/#returns","title":"Returns","text":"
  • List of tuples of the form [(valid_retrieval_filename1, distance), (valid_retrieval_filename2, distance)]
"},{"location":"handlers/search/retrieval/","title":"Retrieval","text":""},{"location":"handlers/search/retrieval/#cosine_similarity_chunk","title":"cosine_similarity_chunk","text":"
def cosine_similarity_chunk(t)\n
"},{"location":"handlers/search/retrieval/#get_cosine_similarity","title":"get_cosine_similarity","text":"
def get_cosine_similarity(X, verbose, chunk_size, threshold, num_workers)\n
"},{"location":"handlers/search/retrieval/#class-hasheval","title":"class HashEval","text":""},{"location":"handlers/search/retrieval/#__init__","title":"__init__","text":"
def __init__(test, queries, distance_function, verbose, threshold, search_method, num_dist_workers)\n

Initializes a HashEval object, which offers an interface to control hashing and search methods for the desired dataset. Computes a map of duplicate images in the document space given certain input control parameters.

"},{"location":"handlers/search/retrieval/#retrieve_results","title":"retrieve_results","text":"
def retrieve_results(scores)\n

Return results with or without scores.

"},{"location":"handlers/search/retrieval/#args","title":"Args","text":"
  • scores: Boolean indicating whether results are to be returned with or without scores.
"},{"location":"handlers/search/retrieval/#returns","title":"Returns","text":"
  • if scores is True, then a dictionary of the form {'image1.jpg': [('image1_duplicate1.jpg', score), ('image1_duplicate2.jpg', score)], 'image2.jpg': [], ..}

  • if scores is False, then a dictionary of the form {'image1.jpg': ['image1_duplicate1.jpg', 'image1_duplicate2.jpg'], 'image2.jpg': ['image1_duplicate1.jpg', ..], ..}

"},{"location":"methods/cnn/","title":"CNN","text":""},{"location":"methods/cnn/#class-cnn","title":"class CNN","text":"

Find duplicates using CNN and/or generate CNN encodings given a single image or a directory of images.

The module can be used for 2 purposes: Encoding generation and duplicate detection.

  • Encodings generation: To propagate an image through a Convolutional Neural Network architecture and generate encodings. The generated encodings can be used at a later time for deduplication. Using the method 'encode_image', the CNN encodings for a single image can be obtained while the 'encode_images' method can be used to get encodings for all images in a directory.

  • Duplicate detection: Find duplicates either using the encoding mapping generated previously using 'encode_images' or using a Path to the directory that contains the images that need to be deduplicated. 'find_duplicates' and 'find_duplicates_to_remove' methods are provided to accomplish these tasks.

"},{"location":"methods/cnn/#__init__","title":"__init__","text":"
def __init__(verbose, model_config)\n

Initializes a PyTorch MobileNetV3 model that is sliced at the last convolutional layer. Sets the batch size for the PyTorch dataloader to 64 samples.

"},{"location":"methods/cnn/#args","title":"Args","text":"
  • verbose: Display progress bar if True else disable it. Default value is True.

  • model_config: A CustomModel that can be used to initialize a custom PyTorch model along with the corresponding transform.

"},{"location":"methods/cnn/#apply_preprocess","title":"apply_preprocess","text":"
def apply_preprocess(im_arr)\n

Apply preprocessing function for mobilenet to images.

"},{"location":"methods/cnn/#args_1","title":"Args","text":"
  • im_arr: Image typecast to numpy array.
"},{"location":"methods/cnn/#returns","title":"Returns","text":"
  • transformed_image_tensor: Transformed images returned as a pytorch tensor.
"},{"location":"methods/cnn/#encode_image","title":"encode_image","text":"
def encode_image(image_file, image_array)\n

Generate CNN encoding for a single image.

"},{"location":"methods/cnn/#args_2","title":"Args","text":"
  • image_file: Path to the image file.

  • image_array: Optional, used instead of image_file. Image typecast to numpy array.

"},{"location":"methods/cnn/#returns_1","title":"Returns","text":"
  • encoding: Encodings for the image in the form of numpy array.
"},{"location":"methods/cnn/#example-usage","title":"Example usage","text":"
from imagededup.methods import CNN\nmyencoder = CNN()\nencoding = myencoder.encode_image(image_file='path/to/image.jpg')\nOR\nencoding = myencoder.encode_image(image_array=<numpy array of image>)\n
"},{"location":"methods/cnn/#encode_images","title":"encode_images","text":"
def encode_images(image_dir, recursive, num_enc_workers)\n

Generate CNN encodings for all images in a given directory of images.

"},{"location":"methods/cnn/#args_3","title":"Args","text":"
  • image_dir: Path to the image directory.

  • recursive: Optional, find images recursively in a nested image directory structure, set to False by default.

  • num_enc_workers: Optional, number of cpu cores to use for multiprocessing encoding generation (supported only on the Linux platform), set to 0 by default. 0 disables multiprocessing.

"},{"location":"methods/cnn/#returns_2","title":"Returns","text":"
  • dictionary: Contains a mapping of filenames and the corresponding numpy arrays of CNN encodings.
"},{"location":"methods/cnn/#example-usage_1","title":"Example usage","text":"
from imagededup.methods import CNN\nmyencoder = CNN()\nencoding_map = myencoder.encode_images(image_dir='path/to/image/directory')\n
"},{"location":"methods/cnn/#find_duplicates","title":"find_duplicates","text":"
def find_duplicates(image_dir, encoding_map, min_similarity_threshold, scores, outfile, recursive, num_enc_workers, num_sim_workers)\n

Find duplicates for each file. Takes in the path of the directory or an encoding dictionary in which duplicates are to be detected above the given threshold. Returns a dictionary containing the filename as key and a list of duplicate file names as value. Optionally, the cosine similarity scores could be returned along with the duplicate filenames for each query file.

"},{"location":"methods/cnn/#args_4","title":"Args","text":"
  • image_dir: Path to the directory containing all the images.

  • encoding_map: Optional, used instead of image_dir, a dictionary containing mapping of filenames and corresponding CNN encodings.

  • min_similarity_threshold: Optional, threshold value (must be a float between -1.0 and 1.0). Default is 0.9.

  • scores: Optional, boolean indicating whether similarity scores are to be returned along with retrieved duplicates.

  • outfile: Optional, name of the file to save the results, must be a json. Default is None.

  • recursive: Optional, find images recursively in a nested image directory structure, set to False by default.

  • num_enc_workers: Optional, number of cpu cores to use for multiprocessing encoding generation (supported only on the Linux platform), set to 0 by default. 0 disables multiprocessing.

  • num_sim_workers: Optional, number of cpu cores to use for multiprocessing similarity computation, set to number of CPUs in the system by default. 0 disables multiprocessing.

"},{"location":"methods/cnn/#returns_3","title":"Returns","text":"
  • dictionary: if scores is True, then a dictionary of the form {'image1.jpg': [('image1_duplicate1.jpg', score), ('image1_duplicate2.jpg', score)], 'image2.jpg': [] ..}. if scores is False, then a dictionary of the form {'image1.jpg': ['image1_duplicate1.jpg', 'image1_duplicate2.jpg'], 'image2.jpg':['image1_duplicate1.jpg',..], ..}
"},{"location":"methods/cnn/#example-usage_2","title":"Example usage","text":"
from imagededup.methods import CNN\nmyencoder = CNN()\nduplicates = myencoder.find_duplicates(image_dir='path/to/directory', min_similarity_threshold=0.85, scores=True,\noutfile='results.json')\n\nOR\n\nfrom imagededup.methods import CNN\nmyencoder = CNN()\nduplicates = myencoder.find_duplicates(encoding_map=<mapping filename to cnn encodings>,\nmin_similarity_threshold=0.85, scores=True, outfile='results.json')\n
"},{"location":"methods/cnn/#find_duplicates_to_remove","title":"find_duplicates_to_remove","text":"
def find_duplicates_to_remove(image_dir, encoding_map, min_similarity_threshold, outfile, recursive, num_enc_workers, num_sim_workers)\n

Return a list of image file names to remove based on the similarity threshold. Does not remove the mentioned files.

"},{"location":"methods/cnn/#args_5","title":"Args","text":"
  • image_dir: Path to the directory containing all the images.

  • encoding_map: Optional, used instead of image_dir, a dictionary containing mapping of filenames and corresponding CNN encodings.

  • min_similarity_threshold: Optional, threshold value (must be a float between -1.0 and 1.0). Default is 0.9.

  • outfile: Optional, name of the file to save the results, must be a json. Default is None.

  • recursive: Optional, find images recursively in a nested image directory structure, set to False by default.

  • num_enc_workers: Optional, number of cpu cores to use for multiprocessing encoding generation (supported only on the Linux platform), set to 0 by default. 0 disables multiprocessing.

  • num_sim_workers: Optional, number of cpu cores to use for multiprocessing similarity computation, set to number of CPUs in the system by default. 0 disables multiprocessing.

"},{"location":"methods/cnn/#returns_4","title":"Returns","text":"
  • duplicates: List of image file names that should be removed.
"},{"location":"methods/cnn/#example-usage_3","title":"Example usage","text":"
from imagededup.methods import CNN\nmyencoder = CNN()\nduplicates = myencoder.find_duplicates_to_remove(image_dir='path/to/images/directory',\nmin_similarity_threshold=0.85)\n\nOR\n\nfrom imagededup.methods import CNN\nmyencoder = CNN()\nduplicates = myencoder.find_duplicates_to_remove(encoding_map=<mapping filename to cnn encodings>,\nmin_similarity_threshold=0.85, outfile='results.json')\n
"},{"location":"methods/hashing/","title":"Hashing","text":""},{"location":"methods/hashing/#class-hashing","title":"class Hashing","text":"

Find duplicates using hashing algorithms and/or generate hashes given a single image or a directory of images.

The module can be used for 2 purposes: Encoding generation and duplicate detection.

  • Encoding generation: To generate hashes using a specific hashing method. The generated hashes can be used at a later time for deduplication. Using the method 'encode_image' of the specific hashing method object, the hash for a single image can be obtained, while the 'encode_images' method can be used to get hashes for all images in a directory.

  • Duplicate detection: Find duplicates either using the encoding mapping generated previously using 'encode_images' or using a Path to the directory that contains the images that need to be deduplicated. 'find_duplicates' and 'find_duplicates_to_remove' methods are provided to accomplish these tasks.
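
Putting the two purposes together, a minimal workflow sketch using PHash as the hashing method (the directory path below is a placeholder):

from imagededup.methods import PHash\nphasher = PHash()\n\n# Purpose 1: generate hashes for all images in the directory\nencoding_map = phasher.encode_images(image_dir='path/to/image/directory')\n\n# Purpose 2: find duplicates later, reusing the stored hashes\nduplicates = phasher.find_duplicates(encoding_map=encoding_map, max_distance_threshold=10)\n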

"},{"location":"methods/hashing/#__init__","title":"__init__","text":"
def __init__(verbose)\n

Initialize hashing class.

"},{"location":"methods/hashing/#args","title":"Args","text":"
  • verbose: Display progress bar if True else disable it. Default value is True.
"},{"location":"methods/hashing/#hamming_distance","title":"hamming_distance","text":"
def hamming_distance(hash1, hash2)\n

Calculate the Hamming distance between two hashes. If the length of the hashes is not 64 bits, each hash is padded to 64 bits before the Hamming distance is calculated.

"},{"location":"methods/hashing/#args_1","title":"Args","text":"
  • hash1: Hash string.

  • hash2: Hash string.

"},{"location":"methods/hashing/#returns","title":"Returns","text":"
  • hamming_distance: Hamming distance between the two hashes.
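
A minimal usage sketch (the hash strings below are made-up 16 character hexadecimal values, not hashes of real images):

from imagededup.methods import PHash\nphasher = PHash()\nhash1 = '9fee256239984d71'  # made-up hash\nhash2 = '9fee256239984d70'  # differs from hash1 only in the last hex digit\ndistance = phasher.hamming_distance(hash1, hash2)  # 1, since exactly one bit differs\n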
"},{"location":"methods/hashing/#encode_image","title":"encode_image","text":"
def encode_image(image_file, image_array)\n

Generate hash for a single image.

"},{"location":"methods/hashing/#args_2","title":"Args","text":"
  • image_file: Path to the image file.

  • image_array: Optional, used instead of image_file. Image typecast to numpy array.

"},{"location":"methods/hashing/#returns_1","title":"Returns","text":"
  • hash: A 16 character hexadecimal string hash for the image.
"},{"location":"methods/hashing/#example-usage","title":"Example usage","text":"
from imagededup.methods import <hash-method>\nmyencoder = <hash-method>()\nmyhash = myencoder.encode_image(image_file='path/to/image.jpg')\nOR\nmyhash = myencoder.encode_image(image_array=<numpy array of image>)\n
"},{"location":"methods/hashing/#encode_images","title":"encode_images","text":"
def encode_images(image_dir, recursive, num_enc_workers)\n

Generate hashes for all images in a given directory of images.

"},{"location":"methods/hashing/#args_3","title":"Args","text":"
  • image_dir: Path to the image directory.

  • recursive: Optional, find images recursively in a nested image directory structure, set to False by default.

  • num_enc_workers: Optional, number of cpu cores to use for multiprocessing encoding generation, set to number of CPUs in the system by default. 0 disables multiprocessing.

"},{"location":"methods/hashing/#returns_2","title":"Returns","text":"
  • dictionary: A dictionary that contains a mapping of filenames and the corresponding 16 character hexadecimal hash string (representing 64 bits), such as {'Image1.jpg': 'hash_string1', 'Image2.jpg': 'hash_string2', ...}
"},{"location":"methods/hashing/#example-usage_1","title":"Example usage","text":"
from imagededup.methods import <hash-method>\nmyencoder = <hash-method>()\nmapping = myencoder.encode_images('path/to/directory')\n
"},{"location":"methods/hashing/#find_duplicates","title":"find_duplicates","text":"
def find_duplicates(image_dir, encoding_map, max_distance_threshold, scores, outfile, search_method, recursive, num_enc_workers, num_dist_workers)\n

Find duplicates for each file. Takes in the path of the directory or an encoding dictionary in which duplicates are to be detected. All images with a Hamming distance less than or equal to the max_distance_threshold are regarded as duplicates. Returns a dictionary containing the filename as key and a list of duplicate file names as value. Optionally, the Hamming distances could be returned along with the duplicate filenames for each query file.

"},{"location":"methods/hashing/#args_4","title":"Args","text":"
  • image_dir: Path to the directory containing all the images or dictionary with keys as file names and values as hash strings for the key image file.

  • encoding_map: Optional, used instead of image_dir, a dictionary containing mapping of filenames and corresponding hashes.

  • max_distance_threshold: Optional, Hamming distance between two images below which retrieved duplicates are valid (must be an int between 0 and 64). Default is 10.

  • scores: Optional, boolean indicating whether Hamming distances are to be returned along with retrieved duplicates.

  • outfile: Optional, name of the file to save the results, must be a json. Default is None.

  • search_method: Algorithm used to retrieve duplicates. Default is brute_force_cython for Unix else bktree.

  • recursive: Optional, find images recursively in a nested image directory structure, set to False by default.

  • num_enc_workers: Optional, number of cpu cores to use for multiprocessing encoding generation, set to number of CPUs in the system by default. 0 disables multiprocessing.

  • num_dist_workers: Optional, number of cpu cores to use for multiprocessing distance computation, set to number of CPUs in the system by default. 0 disables multiprocessing.

"},{"location":"methods/hashing/#returns_3","title":"Returns","text":"
  • duplicates dictionary: if scores is True, then a dictionary of the form {'image1.jpg': [('image1_duplicate1.jpg', score), ('image1_duplicate2.jpg', score)], 'image2.jpg': [] ..}. if scores is False, then a dictionary of the form {'image1.jpg': ['image1_duplicate1.jpg', 'image1_duplicate2.jpg'], 'image2.jpg':['image1_duplicate1.jpg',..], ..}
"},{"location":"methods/hashing/#example-usage_2","title":"Example usage","text":"
from imagededup.methods import <hash-method>\nmyencoder = <hash-method>()\nduplicates = myencoder.find_duplicates(image_dir='path/to/directory', max_distance_threshold=15, scores=True,\noutfile='results.json')\n\nOR\n\nfrom imagededup.methods import <hash-method>\nmyencoder = <hash-method>()\nduplicates = myencoder.find_duplicates(encoding_map=<mapping filename to hashes>,\nmax_distance_threshold=15, scores=True, outfile='results.json')\n
"},{"location":"methods/hashing/#find_duplicates_to_remove","title":"find_duplicates_to_remove","text":"
def find_duplicates_to_remove(image_dir, encoding_map, max_distance_threshold, outfile, recursive, num_enc_workers, num_dist_workers)\n

Return a list of image file names to remove based on the Hamming distance threshold. Does not remove the mentioned files.

"},{"location":"methods/hashing/#args_5","title":"Args","text":"
  • image_dir: Path to the directory containing all the images or dictionary with keys as file names and values as hash strings for the key image file.

  • encoding_map: Optional, used instead of image_dir, a dictionary containing mapping of filenames and corresponding hashes.

  • max_distance_threshold: Optional, Hamming distance between two images below which retrieved duplicates are valid (must be an int between 0 and 64). Default is 10.

  • outfile: Optional, name of the file to save the results, must be a json. Default is None.

  • recursive: Optional, find images recursively in a nested image directory structure, set to False by default.

  • num_enc_workers: Optional, number of cpu cores to use for multiprocessing encoding generation, set to number of CPUs in the system by default. 0 disables multiprocessing.

  • num_dist_workers: Optional, number of cpu cores to use for multiprocessing distance computation, set to number of CPUs in the system by default. 0 disables multiprocessing.

"},{"location":"methods/hashing/#returns_4","title":"Returns","text":"
  • duplicates: List of image file names that are found to be duplicates of some other file in the directory.
"},{"location":"methods/hashing/#example-usage_3","title":"Example usage","text":"
from imagededup.methods import <hash-method>\nmyencoder = <hash-method>()\nduplicates = myencoder.find_duplicates_to_remove(image_dir='path/to/images/directory',\nmax_distance_threshold=15)\n\nOR\n\nfrom imagededup.methods import <hash-method>\nmyencoder = <hash-method>()\nduplicates = myencoder.find_duplicates_to_remove(encoding_map=<mapping filename to hashes>,\nmax_distance_threshold=15, outfile='results.json')\n
"},{"location":"methods/hashing/#class-phash","title":"class PHash","text":"

Inherits from Hashing base class and implements perceptual hashing (Implementation reference: http://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html).

Offers all the functionality mentioned in hashing class.

"},{"location":"methods/hashing/#example-usage_4","title":"Example usage","text":"
# Perceptual hash for images\nfrom imagededup.methods import PHash\nphasher = PHash()\nperceptual_hash = phasher.encode_image(image_file = 'path/to/image.jpg')\nOR\nperceptual_hash = phasher.encode_image(image_array = <numpy image array>)\nOR\nperceptual_hashes = phasher.encode_images(image_dir = 'path/to/directory')  # for a directory of images\n\n# Finding duplicates:\nfrom imagededup.methods import PHash\nphasher = PHash()\nduplicates = phasher.find_duplicates(image_dir='path/to/directory', max_distance_threshold=15, scores=True)\nOR\nduplicates = phasher.find_duplicates(encoding_map=encoding_map, max_distance_threshold=15, scores=True)\n\n# Finding duplicates to return a single list of duplicates in the image collection\nfrom imagededup.methods import PHash\nphasher = PHash()\nfiles_to_remove = phasher.find_duplicates_to_remove(image_dir='path/to/images/directory',\nmax_distance_threshold=15)\nOR\nfiles_to_remove = phasher.find_duplicates_to_remove(encoding_map=encoding_map, max_distance_threshold=15)\n
"},{"location":"methods/hashing/#__init___1","title":"__init__","text":"
def __init__(verbose)\n

Initialize perceptual hashing class.

"},{"location":"methods/hashing/#args_6","title":"Args","text":"
  • verbose: Display progress bar if True else disable it. Default value is True.
"},{"location":"methods/hashing/#class-ahash","title":"class AHash","text":"

Inherits from Hashing base class and implements average hashing. (Implementation reference: http://www.hackerfactor.com/blog/index.php?/archives/529-Kind-of-Like-That.html)

Offers all the functionality mentioned in hashing class.

"},{"location":"methods/hashing/#example-usage_5","title":"Example usage","text":"
# Average hash for images\nfrom imagededup.methods import AHash\nahasher = AHash()\naverage_hash = ahasher.encode_image(image_file = 'path/to/image.jpg')\nOR\naverage_hash = ahasher.encode_image(image_array = <numpy image array>)\nOR\naverage_hashes = ahasher.encode_images(image_dir = 'path/to/directory')  # for a directory of images\n\n# Finding duplicates:\nfrom imagededup.methods import AHash\nahasher = AHash()\nduplicates = ahasher.find_duplicates(image_dir='path/to/directory', max_distance_threshold=15, scores=True)\nOR\nduplicates = ahasher.find_duplicates(encoding_map=encoding_map, max_distance_threshold=15, scores=True)\n\n# Finding duplicates to return a single list of duplicates in the image collection\nfrom imagededup.methods import AHash\nahasher = AHash()\nfiles_to_remove = ahasher.find_duplicates_to_remove(image_dir='path/to/images/directory',\nmax_distance_threshold=15)\nOR\nfiles_to_remove = ahasher.find_duplicates_to_remove(encoding_map=encoding_map, max_distance_threshold=15)\n
"},{"location":"methods/hashing/#__init___2","title":"__init__","text":"
def __init__(verbose)\n

Initialize average hashing class.

"},{"location":"methods/hashing/#args_7","title":"Args","text":"
  • verbose: Display progress bar if True else disable it. Default value is True.
"},{"location":"methods/hashing/#class-dhash","title":"class DHash","text":"

Inherits from Hashing base class and implements difference hashing. (Implementation reference: http://www.hackerfactor.com/blog/index.php?/archives/529-Kind-of-Like-That.html)

Offers all the functionality mentioned in hashing class.

"},{"location":"methods/hashing/#example-usage_6","title":"Example usage","text":"
# Difference hash for images\nfrom imagededup.methods import DHash\ndhasher = DHash()\ndifference_hash = dhasher.encode_image(image_file = 'path/to/image.jpg')\nOR\ndifference_hash = dhasher.encode_image(image_array = <numpy image array>)\nOR\ndifference_hashes = dhasher.encode_images(image_dir = 'path/to/directory')  # for a directory of images\n\n# Finding duplicates:\nfrom imagededup.methods import DHash\ndhasher = DHash()\nduplicates = dhasher.find_duplicates(image_dir='path/to/directory', max_distance_threshold=15, scores=True)\nOR\nduplicates = dhasher.find_duplicates(encoding_map=encoding_map, max_distance_threshold=15, scores=True)\n\n# Finding duplicates to return a single list of duplicates in the image collection\nfrom imagededup.methods import DHash\ndhasher = DHash()\nfiles_to_remove = dhasher.find_duplicates_to_remove(image_dir='path/to/images/directory',\nmax_distance_threshold=15)\nOR\nfiles_to_remove = dhasher.find_duplicates_to_remove(encoding_map=encoding_map, max_distance_threshold=15)\n
"},{"location":"methods/hashing/#__init___3","title":"__init__","text":"
def __init__(verbose)\n

Initialize difference hashing class.

"},{"location":"methods/hashing/#args_8","title":"Args","text":"
  • verbose: Display progress bar if True else disable it. Default value is True.
"},{"location":"methods/hashing/#class-whash","title":"class WHash","text":"

Inherits from Hashing base class and implements wavelet hashing. (Implementation reference: https://fullstackml.com/wavelet-image-hash-in-python-3504fdd282b5)

Offers all the functionality mentioned in hashing class.

"},{"location":"methods/hashing/#example-usage_7","title":"Example usage","text":"
# Wavelet hash for images\nfrom imagededup.methods import WHash\nwhasher = WHash()\nwavelet_hash = whasher.encode_image(image_file = 'path/to/image.jpg')\nOR\nwavelet_hash = whasher.encode_image(image_array = <numpy image array>)\nOR\nwavelet_hashes = whasher.encode_images(image_dir = 'path/to/directory')  # for a directory of images\n\n# Finding duplicates:\nfrom imagededup.methods import WHash\nwhasher = WHash()\nduplicates = whasher.find_duplicates(image_dir='path/to/directory', max_distance_threshold=15, scores=True)\nOR\nduplicates = whasher.find_duplicates(encoding_map=encoding_map, max_distance_threshold=15, scores=True)\n\n# Finding duplicates to return a single list of duplicates in the image collection\nfrom imagededup.methods import WHash\nwhasher = WHash()\nfiles_to_remove = whasher.find_duplicates_to_remove(image_dir='path/to/images/directory',\nmax_distance_threshold=15)\nOR\nfiles_to_remove = whasher.find_duplicates_to_remove(encoding_map=encoding_map, max_distance_threshold=15)\n
"},{"location":"methods/hashing/#__init___4","title":"__init__","text":"
def __init__(verbose)\n

Initialize wavelet hashing class.

"},{"location":"methods/hashing/#args_9","title":"Args","text":"
  • verbose: Display progress bar if True else disable it. Default value is True.
"},{"location":"user_guide/benchmarks/","title":"Benchmarks","text":"

To give an idea of the speed and accuracy of the implemented algorithms, a benchmark has been provided on the UKBench dataset (zip file titled 'UKBench image collection', with a size of ~1.5 GB) and some variations derived from it.

"},{"location":"user_guide/benchmarks/#datasets","title":"Datasets","text":"

The following 3 datasets have been used:

  1. Near duplicate dataset (UKBench dataset): This dataset has near duplicates that are arranged in groups of 4. There are a total of 2550 such groups amounting to a total of 10200 RGB images. The size of each image is 640 x 480 with jpg extension. The image below depicts 3 example groups from the UKBench dataset. Each row represents a group with the corresponding 4 images from the group.

  2. Transformed dataset derived from the UKBench dataset: An image was taken from each of 1800 different groups of the UKBench dataset and the following 5 transformations were applied to the original image:

    • Random crop preserving the original aspect ratio (new size - 560 x 420)
    • Horizontal flip
    • Vertical flip
    • 25 degree rotation
    • Resizing with change in aspect ratio (new aspect ratio - 1:1)

    Thus, each group has a total of 6 images (original + transformed). A total of 1800 such groups were created totalling 10800 images in the dataset.

  3. Exact duplicate dataset: An image from each of the 2550 image groups of the UKBench dataset was taken and an exact duplicate was created. The number of images totalled 5100.

"},{"location":"user_guide/benchmarks/#environment","title":"Environment","text":"

The benchmarks were performed on an AWS ec2 r5.xlarge instance having 4 vCPUs and 32 GB memory. The instance does not have a GPU, so all the runs are done on CPUs.

"},{"location":"user_guide/benchmarks/#metrics","title":"Metrics","text":"

The metrics used here are classification metrics as explained in the documentation.

class-0 refers to non-duplicate image pairs.

class-1 refers to duplicate image pairs.

The reported numbers are rounded to 3 digits.

"},{"location":"user_guide/benchmarks/#timings","title":"Timings","text":"

The times are reported in seconds and comprise the time taken to generate encodings and find duplicates. The time taken to perform the evaluation task is NOT reported.

"},{"location":"user_guide/benchmarks/#threshold-selection","title":"Threshold selection","text":"

For each method, 3 different thresholds have been selected.

For hashing methods, the following max_distance_threshold values are used:

  • 0: Indicates that exactly the same hash should be generated for the image pairs to be considered duplicates.
  • 10: Default.
  • 32: Halfway between the maximum and minimum values (0 and 64).

For the cnn method, the following min_similarity_threshold values are used:

  • 1.0: Indicates that exactly the same cnn embeddings should be generated for the image pairs to be considered duplicates.
  • 0.9: Default.
  • 0.5: A threshold that allows large deviation between image pairs.
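
As a sketch of how these thresholds map to the find_duplicates arguments (directory paths are placeholders; see the method documentation for the full argument lists):

from imagededup.methods import PHash, CNN\n\n# Hashing: max_distance_threshold of 0, 10 (default) or 32\nphasher = PHash()\nhash_duplicates = phasher.find_duplicates(image_dir='path/to/dataset', max_distance_threshold=10)\n\n# CNN: min_similarity_threshold of 1.0, 0.9 (default) or 0.5\ncnn_encoder = CNN()\ncnn_duplicates = cnn_encoder.find_duplicates(image_dir='path/to/dataset', min_similarity_threshold=0.9)\n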
"},{"location":"user_guide/benchmarks/#results","title":"Results","text":""},{"location":"user_guide/benchmarks/#near-duplicate-dataset","title":"Near Duplicate dataset","text":"Method Threshold Time (s) class-0 precision class-1 precision class-0 recall class-1 recall dhash 0 35.570 0.999 0.0 1.0 0.0 dhash 10 35.810 0.999 0.018 0.999 0.0461 dhash 32 106.670 0.998 0.0 0.326 0.884 phash 0 40.073 0.999 1.0 1.0 0.0 phash 10 39.056 0.999 0.498 0.999 0.016 phash 32 98.835 0.998 0.0 0.343 0.856 ahash 0 36.171 0.999 0.282 0.999 0.002 ahash 10 36.560 0.999 0.012 0.996 0.193 ahash 32 97.170 0.999 0.000 0.448 0.932 whash 0 51.710 0.999 0.112 0.999 0.002 whash 10 51.940 0.999 0.008 0.993 0.199 whash 32 112.560 0.999 0.0 0.416 0.933 cnn 0.5 379.680 0.999 0.0 0.856 0.999 cnn 0.9 377.157 0.999 0.995 0.999 0.127 cnn 1.0 379.570 0.999 0.0 1.0 0.0"},{"location":"user_guide/benchmarks/#observations","title":"Observations","text":"
  • The cnn method with a threshold between 0.5 and 0.9 would work best for finding near duplicates. This is indicated by the extreme values that class-1 precision and recall take for the two thresholds.
  • Hashing methods do not perform well for finding near duplicates.
"},{"location":"user_guide/benchmarks/#transformed-dataset","title":"Transformed dataset","text":"Method Threshold Time (s) class-0 precision class-1 precision class-0 recall class-1 recall dhash 0 25.360 0.999 1.0 1.0 0.040 dhash 10 25.309 0.999 0.138 0.999 0.117 dhash 32 108.960 0.990 0.0 0.336 0.872 phash 0 28.069 0.999 1.0 1.0 0.050 phash 10 28.075 0.999 0.341 0.999 0.079 phash 32 107.079 0.990 0.003 0.328 0.847 ahash 0 25.270 0.999 0.961 0.999 0.058 ahash 10 25.389 0.999 0.035 0.997 0.216 ahash 32 93.084 0.990 0.0 0.441 0.849 whash 0 40.390 0.999 0.917 0.999 0.061 whash 10 41.260 0.999 0.023 0.996 0.203 whash 32 109.630 0.990 0.0 0.410 0.853 cnn 0.5 397.380 0.999 0.003 0.852 0.999 cnn 0.9 392.090 0.999 0.999 0.990 0.384 cnn 1.0 396.250 0.990 0.0 1.0 0.0"},{"location":"user_guide/benchmarks/#observations_1","title":"Observations","text":"
  • The cnn method with threshold 0.9 seems to work best for finding transformed duplicates. A slightly lower min_similarity_threshold value could lead to a higher class-1 recall.
  • Hashing methods do not perform well for finding transformed duplicates. Resized images are found easily, but all other transformations lead to poor performance for hashing methods.
"},{"location":"user_guide/benchmarks/#exact-duplicates-dataset","title":"Exact duplicates dataset","text":"Method Threshold Time (s) class-0 precision class-1 precision class-0 recall class-1 recall dhash 0 18.380 1.0 1.0 1.0 1.0 dhash 10 18.410 1.0 0.223 0.999 1.0 dhash 32 34.602 1.0 0.0 0.327 1.0 phash 0 19.780 1.0 1.0 1.0 1.0 phash 10 20.012 1.0 0.980 0.999 1.0 phash 32 34.054 1.0 0.0 0.344 1.0 ahash 0 18.180 1.0 0.998 0.999 1.0 ahash 10 18.228 1.0 0.044 0.995 1.0 ahash 32 31.961 1.0 0.0 0.448 1.0 whash 0 26.097 1.0 0.980 0.999 1.0 whash 10 26.056 1.0 0.029 0.993 1.0 whash 32 39.408 1.0 0.0 0.417 1.0 cnn 0.5 192.050 1.0 0.001 0.860 1.0 cnn 0.9 191.024 1.0 1.0 1.0 1.0 cnn 1.0 194.270 0.999 1.0 1.0 0.580*

* The value is low as opposed to the expected 1.0 because of the cosine_similarity function from scikit-learn (used within the package), which sometimes calculates the similarity to be slightly less than 1.0 even when the vectors are the same.
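
The effect can be reproduced directly with scikit-learn (a minimal sketch; the random vector is purely illustrative):

import numpy as np\nfrom sklearn.metrics.pairwise import cosine_similarity\n\nv = np.random.rand(1, 576).astype('float32')  # same shape as the default cnn encodings\nsim = cosine_similarity(v, v)[0, 0]\n# sim may come out as e.g. 0.99999994 instead of exactly 1.0 due to floating point\n# rounding, which is why min_similarity_threshold=1.0 misses some exact duplicates\n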

"},{"location":"user_guide/benchmarks/#observations_2","title":"Observations","text":"
  • Difference hashing is the fastest (max_distance_threshold 0).
  • When using hashing methods for exact duplicates, keep max_distance_threshold at a low value. The value of 0 is good, but a slightly higher value should also work fine.
  • When using the cnn method, keep min_similarity_threshold at a high value. The default value of 0.9 seems to work well. A slightly higher value can also be used.
"},{"location":"user_guide/benchmarks/#summary","title":"Summary","text":"
  • Near duplicate dataset: use cnn with an appropriate min_similarity_threshold.
  • Transformed dataset: use cnn with min_similarity_threshold of around 0.9 (default).
  • Exact duplicates dataset: use Difference hashing with 0 max_distance_threshold.
  • A higher max_distance_threshold (i.e., for hashing methods) leads to a higher execution time. The cnn method doesn't seem much affected by min_similarity_threshold (though a lower value adds a few seconds to the execution time, as can be seen in the runs above).
  • Generally speaking, the cnn method takes longer to run than the hashing methods for all datasets. If a GPU is available, the cnn method should be much faster.
"},{"location":"user_guide/custom_model/","title":"Using custom models for CNN","text":"

To allow users to use custom models for encoding generation, we provide a CustomModel construct which serves as a wrapper for a user-defined feature extractor. The CustomModel consists of the following attributes:

  • name: The name of the custom model. Can be set to any string.
  • model: A PyTorch model object, which is a subclass of torch.nn.Module and implements the forward method. The output of the forward method should be a tensor of shape (batch_size x features). Alternatively, a __call__ method is also accepted.
  • transform: A function that transforms a PIL.Image object into a PyTorch tensor. Should correspond to the preprocessing logic of the supplied model.

CustomModel is provided while initializing the cnn object and can be used in the following 2 scenarios:

  1. Using the models provided with the imagededup package. There are 3 models provided currently:
    • MobileNetV3 (MobileNetV3 Small) - this is the default.
    • ViT (Vision Transformer - B16 IMAGENET1K_SWAG_E2E_V1)
    • EfficientNet (EfficientNet B4 - IMAGENET1K_V1)
from imagededup.methods import CNN\n\n# Get CustomModel construct\nfrom imagededup.utils import CustomModel\n\n# Get the prepackaged models from imagededup\nfrom imagededup.utils.models import ViT, MobilenetV3, EfficientNet\n\n\n# Declare a custom config with CustomModel, the prepackaged models come with a name and transform function\ncustom_config = CustomModel(name=EfficientNet.name,\n                            model=EfficientNet(), \n                            transform=EfficientNet.transform)\n\n# Use model_config argument to pass the custom config\ncnn = CNN(model_config=custom_config)\n\n# Use the model as usual\n...\n

  2. Using a user-defined custom model.

from imagededup.methods import CNN\n\n# Get CustomModel construct\nfrom imagededup.utils import CustomModel\n\n# Import necessary pytorch constructs for initializing a custom feature extractor\nimport torch\nfrom torchvision.transforms import transforms\n\n# Declare custom feature extractor class\nclass MyModel(torch.nn.Module):\n    transform = transforms.Compose(\n        [\n            transforms.ToTensor()\n        ]\n    )\n    name = 'my_custom_model'\n\n    def __init__(self):\n        super().__init__()\n        # Define the layers of the model here\n\n    def forward(self, x):\n        # Do something with x\n        return x\n\ncustom_config = CustomModel(name=MyModel.name,\n                            model=MyModel(),\n                            transform=MyModel.transform)\n\ncnn = CNN(model_config=custom_config)\n\n# Use the model as usual\n...\n

It is not necessary to bundle name and transform functions with the model class. They can be passed separately as well.
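
For instance (a sketch reusing the MyModel class from the snippet above; the transform shown is illustrative and should be replaced by preprocessing that actually matches the supplied model):

from imagededup.methods import CNN\nfrom imagededup.utils import CustomModel\nfrom torchvision.transforms import transforms\n\n# name and transform supplied separately instead of as class attributes\nmy_transform = transforms.Compose([transforms.Resize((256, 256)), transforms.ToTensor()])\ncustom_config = CustomModel(name='my_custom_model', model=MyModel(), transform=my_transform)\ncnn = CNN(model_config=custom_config)\n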

Examples for both scenarios can be found in the examples section.

"},{"location":"user_guide/encoding_generation/","title":"Encoding generation","text":"

It might be desirable to only generate the hashes/cnn encodings for a given image or for all images in a directory, instead of directly deduplicating using the find_duplicates method. Encodings can be generated for a directory of images or for a single image:

  • Encoding generation for all images in a directory
  • Encoding generation for a single image
"},{"location":"user_guide/encoding_generation/#encoding-generation-for-all-images-in-a-directory","title":"Encoding generation for all images in a directory","text":"

To generate encodings for all images in an image directory, the encode_images function can be used. The general API for using encode_images is:

from imagededup.methods import <method-name>\nmethod_object = <method-name>()\nencodings = method_object.encode_images(image_dir='path/to/image/directory')\n

where the returned variable encodings is a dictionary mapping image file names to the corresponding encodings:

{\n  'image1.jpg': <encoding-image-1>,\n  'image2.jpg': <encoding-image-2>,\n   ..\n}\n

For hashing algorithms, the encodings are 64 bit hashes represented as 16 character hexadecimal strings.

For cnn, the encodings are numpy arrays, each with shape (576,).

The 'method-name' corresponds to one of the deduplication methods available and can be set to:

  • PHash
  • AHash
  • DHash
  • WHash
  • CNN
"},{"location":"user_guide/encoding_generation/#options","title":"Options","text":"
  • image_dir: Path to the image directory for which encodings are to be generated.
  • recursive: Optional, find images recursively in a nested directory structure, set to False by default.
"},{"location":"user_guide/encoding_generation/#considerations","title":"Considerations","text":"
  • If an image in the image directory can't be loaded, no encodings are generated for the image. Hence, there is no entry for the image in the returned encodings dictionary.
  • Supported image formats: 'JPEG', 'PNG', 'BMP', 'MPO', 'PPM', 'TIFF', 'GIF', 'SVG', 'PGM', 'PBM', 'WEBP'.
"},{"location":"user_guide/encoding_generation/#examples","title":"Examples","text":"

Generating encodings using Difference hash:

from imagededup.methods import DHash\ndhasher = DHash()\nencodings = dhasher.encode_images(image_dir='path/to/image/directory')\n
"},{"location":"user_guide/encoding_generation/#encoding-generation-for-a-single-image","title":"Encoding generation for a single image","text":"

To generate encodings for a single image, the encode_image function can be used. The general API for using encode_image is:

from imagededup.methods import <method-name>\nmethod_object = <method-name>()\nencoding = method_object.encode_image(image_file='path/to/image/file')\n

where the returned variable encoding is either a hexadecimal string if a hashing method is used or a (576,) numpy array if cnn is used.

"},{"location":"user_guide/encoding_generation/#options_1","title":"Options","text":"
  • image_file: Optional, path to the image file for which encodings are to be generated.
  • image_array: Optional, used instead of image_file attribute. A numpy array representing the image.
"},{"location":"user_guide/encoding_generation/#considerations_1","title":"Considerations","text":"
  • If the image can't be loaded, no encodings are generated for the image and None is returned.
  • Supported image formats: 'JPEG', 'PNG', 'BMP', 'MPO', 'PPM', 'TIFF', 'GIF', 'SVG', 'PGM', 'PBM', 'WEBP'.
"},{"location":"user_guide/encoding_generation/#examples_1","title":"Examples","text":"

Generating encodings using Difference hash:

from imagededup.methods import DHash\ndhasher = DHash()\nencoding = dhasher.encode_image(image_file='path/to/image/file')\n
"},{"location":"user_guide/evaluating_performance/","title":"Evaluation of deduplication quality","text":"

To determine the quality of a deduplication algorithm and the corresponding threshold, an evaluation framework is provided.

Given a ground truth mapping consisting of file names and a list of duplicates for each file along with a retrieved mapping from the deduplication algorithm for the same files, the following metrics can be obtained using the framework:

  • Mean Average Precision (MAP)
  • Mean Normalized Discounted Cumulative Gain (NDCG)
  • Jaccard Index
  • Per class Precision (class 0 = non-duplicate image pairs, class 1 = duplicate image pairs)
  • Per class Recall (class 0 = non-duplicate image pairs, class 1 = duplicate image pairs)
  • Per class f1-score (class 0 = non-duplicate image pairs, class 1 = duplicate image pairs)

The API for obtaining these metrics is as below:

from imagededup.evaluation import evaluate\nmetrics = evaluate(ground_truth_map, retrieved_map, metric='<metric-name>')\n

where the returned variable metrics is a dictionary containing the following content:

{\n  'map': <map>,\n  'ndcg': <mean ndcg>,\n  'jaccard': <mean jaccard index>,\n  'precision': <numpy array having per class precision>,\n  'recall': <numpy array having per class recall>,\n  'f1-score': <numpy array having per class f1-score>,\n  'support': <numpy array having per class support>\n}\n
"},{"location":"user_guide/evaluating_performance/#options","title":"Options","text":"
  • ground_truth_map: A dictionary representing ground truth with filenames as key and a list of duplicate filenames as value.
  • retrieved_map: A dictionary representing retrieved duplicates with filenames as key and a list of retrieved duplicate filenames as value.
  • metric: Can take one of the following values:
    • 'map'
    • 'ndcg'
    • 'jaccard'
    • 'classification': Returns per class precision, recall, f1-score, support
    • 'all' (default, returns all the above metrics)
"},{"location":"user_guide/evaluating_performance/#considerations","title":"Considerations","text":"
  • Presently, the ground truth map should be prepared manually by the user. Symmetric relations between duplicates must be represented in the ground truth map. If an image i is a duplicate of image j, then j must also be represented as a duplicate of i. Absence of symmetric relations will lead to an exception.

  • Both the ground_truth_map and retrieved_map must have the same keys.

  • There is a difference between the way information retrieval metrics (map, ndcg, jaccard index) and classification metrics (precision, recall, f1-score) treat the symmetric relationships in duplicates. Consider the following ground_truth_map and retrieved_map:

ground_truth_map:

{\n  '1.jpg': ['2.jpg', '4.jpg'],\n  '2.jpg': ['1.jpg'],\n  '3.jpg': [],\n  '4.jpg': ['1.jpg']\n}\n

retrieved_map:

{\n  '1.jpg': ['2.jpg'],\n  '2.jpg': ['1.jpg'],\n  '3.jpg': [],\n  '4.jpg': []\n}\n

From the above, it can be seen that images '1.jpg' and '4.jpg' are not found to be duplicates of each other by the deduplication algorithm.

For calculating information retrieval metrics, each key in the maps is considered as an independent 'query'. In the ground truth, '4.jpg' is a duplicate of the key '1.jpg'. When it is not retrieved, it is considered a miss for query '1.jpg'. Similarly, '1.jpg' is a duplicate of the key '4.jpg' in the ground truth. When this is not retrieved, it is considered a miss for query '4.jpg'. Thus, the missing relationship is accounted for twice instead of just once.

Classification metrics, on the other hand, consider the relationships only once by forming unique pairs of images and labelling each pair as a 0 (non-duplicate image pair) and 1 (duplicate image pair).

Using the ground_truth_map, the ground truth pairs with the corresponding labels are:

Image Pair | Label
('1.jpg', '2.jpg') | 1
('1.jpg', '3.jpg') | 0
('1.jpg', '4.jpg') | 1
('2.jpg', '3.jpg') | 0
('2.jpg', '4.jpg') | 0
('3.jpg', '4.jpg') | 0

Similarly, using retrieved_map, the retrieved pairs are generated:

Image Pair | Label
('1.jpg', '2.jpg') | 1
('1.jpg', '3.jpg') | 0
('1.jpg', '4.jpg') | 0
('2.jpg', '3.jpg') | 0
('2.jpg', '4.jpg') | 0
('3.jpg', '4.jpg') | 0

These two sets of pairs are then used to calculate metrics such as precision/recall/f1-score. It can be seen that the missing relationship between the pair ('1.jpg', '4.jpg') is accounted for only once.
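
Tying this together, a sketch using the two maps above (the expected class-1 numbers follow from the pair tables):

from imagededup.evaluation import evaluate\n\nground_truth_map = {'1.jpg': ['2.jpg', '4.jpg'], '2.jpg': ['1.jpg'], '3.jpg': [], '4.jpg': ['1.jpg']}\nretrieved_map = {'1.jpg': ['2.jpg'], '2.jpg': ['1.jpg'], '3.jpg': [], '4.jpg': []}\n\nmetrics = evaluate(ground_truth_map, retrieved_map, metric='classification')\n# From the pair tables: 1 true positive, 0 false positives, 1 false negative,\n# so class-1 precision should be 1.0 and class-1 recall 0.5\n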

"},{"location":"user_guide/finding_duplicates/","title":"Finding duplicates","text":"

There are two methods available to find duplicates:

  • find_duplicates()
  • find_duplicates_to_remove()
"},{"location":"user_guide/finding_duplicates/#find_duplicates","title":"find_duplicates()","text":"

To find duplicates in an image directory, the general API is:

from imagededup.methods import <method-name>\nmethod_object = <method-name>()\nduplicates = method_object.find_duplicates(image_dir='path/to/image/directory',\n                                           <threshold-parameter-value>)\n

Duplicates can also be found if encodings of the images are available:

from imagededup.methods import <method-name>\nmethod_object = <method-name>()\nduplicates = method_object.find_duplicates(encoding_map,\n                                           <threshold-parameter-value>)\n

where the returned variable duplicates is a dictionary with the following content:

{\n  'image1.jpg': ['image1_duplicate1.jpg',\n                'image1_duplicate2.jpg'],\n  'image2.jpg': [..],\n  ..\n}\n

Each key in the duplicates dictionary corresponds to a file in the image directory passed to the image_dir parameter of the find_duplicates function. The value is a list of all file names in the image directory that were found to be duplicates for the key file. The 'method-name' corresponds to one of the deduplication methods available and can be set to:

  • PHash
  • AHash
  • DHash
  • WHash
  • CNN
"},{"location":"user_guide/finding_duplicates/#options","title":"Options","text":"
  • image_dir: Optional, directory where all image files are present.

  • encoding_map: Optional, used instead of image_dir attribute. Set it equal to the dictionary of file names and corresponding encodings (hashes/cnn encodings). The mentioned dictionary can be generated using the corresponding encode_images method.

  • scores: Setting it to True returns the scores representing the hamming distance (for hashing) or cosine similarity (for cnn) of each of the duplicate file names from the key file. In this case, the returned 'duplicates' dictionary has the following content:
{\n  'image1.jpg': [('image1_duplicate1.jpg', score),\n                 ('image1_duplicate2.jpg', score)],\n  'image2.jpg': [..],\n  ..\n}\n

Each key in the duplicates dictionary corresponds to a file in the image directory passed to the image_dir parameter of the find_duplicates function. The value is a list of tuples representing the file names and corresponding scores in the image directory that were found to be duplicates of the key file.

  • outfile: Name of file to which the returned duplicates dictionary is to be written, must be a json. None by default.

  • threshold parameter:

    • min_similarity_threshold for cnn method indicating the minimum amount of cosine similarity that should exist between the key image and a candidate image so that the candidate image can be considered as a duplicate of the key image. Should be a float between -1.0 and 1.0. Default value is 0.9.

    • max_distance_threshold for hashing methods indicating the maximum amount of hamming distance that can exist between the key image and a candidate image so that the candidate image can be considered as a duplicate of the key image. Should be an int between 0 and 64. Default value is 10.

  • recursive: Optional, find images recursively in a nested directory structure, set to False by default.

"},{"location":"user_guide/finding_duplicates/#considerations","title":"Considerations","text":"
  • The returned duplicates dictionary contains symmetric relationships i.e., if an image i is a duplicate of image j, then image j must also be a duplicate of image i. Let's say that the image directory only consists of images i and j, then the duplicates dictionary would have the following content:
{\n  'i': ['j'],\n  'j': ['i']\n}\n
  • If an image in the image directory can't be loaded, no encodings are generated for the image. Hence, the image is disregarded for deduplication and has no entry in the returned duplicates dictionary.
"},{"location":"user_guide/finding_duplicates/#examples","title":"Examples","text":"

To deduplicate an image directory using perceptual hashing, with a maximum allowed hamming distance of 12, scores returned along with duplicate filenames and the returned dictionary saved to file 'my_duplicates.json', use the following:

from imagededup.methods import PHash\nphasher = PHash()\nduplicates = phasher.find_duplicates(image_dir='path/to/image/directory',\n                                     max_distance_threshold=12, \n                                     scores=True, \n                                     outfile='my_duplicates.json')\n

To deduplicate an image directory using cnn, with a minimum cosine similarity of 0.85, no scores returned and the returned dictionary saved to file 'my_duplicates.json', use the following:

from imagededup.methods import CNN\ncnn_encoder = CNN()\nduplicates = cnn_encoder.find_duplicates(image_dir='path/to/image/directory', \n                                         min_similarity_threshold=0.85, \n                                         scores=False, \n                                         outfile='my_duplicates.json')\n
"},{"location":"user_guide/finding_duplicates/#find_duplicates_to_remove","title":"find_duplicates_to_remove()","text":"

Returns a list of files in the image directory that are considered as duplicates. Does NOT remove the said files.

The API is similar to the find_duplicates function (except for the scores attribute, which find_duplicates_to_remove does not accept). This function returns a single list of file names in the directory that are found to be duplicates. The general API for the method is as below:

from imagededup.methods import <method-name>\nmethod_object = <method-name>()\nduplicates = method_object.find_duplicates_to_remove(image_dir='path/to/image/directory', \n                                                     <threshold-parameter-value>)\nOR\n\nduplicates = method_object.find_duplicates_to_remove(encoding_map=encoding_map, \n                                                     <threshold-parameter-value>)\n

In this case, the returned variable duplicates is a list containing the name of image files that are found to be duplicates of some file in the directory:

[\n  'image1_duplicate1.jpg',\n  'image1_duplicate2.jpg'\n  ,..\n]\n

The 'method-name' corresponds to one of the deduplication methods available and can be set to:

  • PHash
  • AHash
  • DHash
  • WHash
  • CNN
"},{"location":"user_guide/finding_duplicates/#options_1","title":"Options","text":"
  • image_dir: Optional, directory where all image files are present.

  • encoding_map: Optional, used instead of image_dir attribute. Set it equal to the dictionary of file names and corresponding encodings (hashes/cnn encodings). The mentioned dictionary can be generated using the corresponding encode_images method.

  • outfile: Name of file to which the returned duplicates dictionary is to be written, must be a json. None by default.

  • threshold parameter:

    • min_similarity_threshold for cnn method indicating the minimum amount of cosine similarity that should exist between the key image and a candidate image so that the candidate image can be considered as a duplicate for the key image. Should be a float between -1.0 and 1.0. Default value is 0.9.

    • max_distance_threshold for hashing methods indicating the maximum amount of hamming distance that can exist between the key image and a candidate image so that the candidate image can be considered as a duplicate for the key image. Should be an int between 0 and 64. Default value is 10.

  • recursive: Optional, find images recursively in a nested directory structure, set to False by default.

"},{"location":"user_guide/finding_duplicates/#considerations_1","title":"Considerations","text":"
  • This method must be used with caution. The symmetric nature of duplicates imposes an issue of marking one image as duplicate and the other as original. Consider the following duplicates dictionary:
{\n  '1.jpg': ['2.jpg'],\n  '2.jpg': ['1.jpg', '3.jpg'],\n  '3.jpg': ['2.jpg']\n}\n

In this case, it is possible to remove only 2.jpg, which leaves 1.jpg and 3.jpg as non-duplicates of each other. However, it is also possible to remove both 1.jpg and 3.jpg, leaving only 2.jpg. The find_duplicates_to_remove method can thus return either of the outputs. In the above example, let's say that 1.jpg is retained, while its duplicate, 2.jpg, is marked as a duplicate. Once 2.jpg is marked as a duplicate, its own found duplicates are disregarded. Thus, 1.jpg and 3.jpg would not be considered duplicates. So, the final return would be:

['2.jpg']\n

This leaves 1.jpg and 3.jpg as non-duplicates in the directory. If the user does not wish to impose this heuristic, it is advised to use the find_duplicates function and apply a custom heuristic to mark files as duplicates, as sketched after this list.

  • If an image in the image directory can't be loaded, no encodings are generated for the image. Hence, the image is disregarded for deduplication and has no entry in the returned duplicates dictionary.
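
A sketch of one such custom heuristic (illustrative only): within each duplicate group, keep the lexicographically smallest filename and mark the rest for removal.

from imagededup.methods import PHash\n\nphasher = PHash()\nduplicates = phasher.find_duplicates(image_dir='path/to/image/directory', max_distance_threshold=10)\n\n# Deliberately simple rule: keep the smallest filename of each group\nto_remove = set()\nfor key, dups in duplicates.items():\n    group = sorted([key] + dups)\n    to_remove.update(group[1:])\n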
"},{"location":"user_guide/finding_duplicates/#examples_1","title":"Examples","text":"

To deduplicate an image directory using perceptual hashing, with a maximum allowed hamming distance of 12, and the returned list saved to file 'my_duplicates.json', use the following:

from imagededup.methods import PHash\nphasher = PHash()\nduplicates = phasher.find_duplicates_to_remove(image_dir='path/to/image/directory', \n                                               max_distance_threshold=12, \n                                               outfile='my_duplicates.json')\n

To deduplicate an image directory using cnn, with a minimum cosine similarity of 0.85 and the returned list saved to file 'my_duplicates.json', use the following:

from imagededup.methods import CNN\ncnn_encoder = CNN()\nduplicates = cnn_encoder.find_duplicates_to_remove(image_dir='path/to/image/directory', \n                                                   min_similarity_threshold=0.85, \n                                                   outfile='my_duplicates.json')\n
"},{"location":"user_guide/plotting_duplicates/","title":"Plotting duplicates of an image","text":"

Once a duplicate dictionary corresponding to an image directory has been obtained (using find_duplicates), duplicates for an image can be plotted using the plot_duplicates method as below:

from imagededup.utils import plot_duplicates\nplot_duplicates(image_dir, duplicate_map, filename, outfile=None)\n

where filename is the file for which duplicates are to be plotted.

"},{"location":"user_guide/plotting_duplicates/#options","title":"Options","text":"
  • image_dir: Directory where all image files are present.

  • duplicate_map: A dictionary representing retrieved duplicates with filenames as key and a list of retrieved duplicate filenames as value. A duplicate_map with scores can also be passed (obtained from find_duplicates function with scores attribute set to True).

  • filename: Image file name for which duplicates are to be plotted.

  • outfile: Optional, name of the file the plot should be saved to. None by default.

The output is a plot showing the given image along with its retrieved duplicates (and scores, if the duplicate_map contains them).

"},{"location":"utils/data_generator/","title":"Data generator","text":""},{"location":"utils/data_generator/#img_dataloader","title":"img_dataloader","text":"
def img_dataloader(image_dir, batch_size, basenet_preprocess, recursive, num_workers)\n
"},{"location":"utils/data_generator/#class-imgdataset","title":"class ImgDataset","text":""},{"location":"utils/data_generator/#__init__","title":"__init__","text":"
def __init__(image_dir, basenet_preprocess, recursive)\n
"},{"location":"utils/data_generator/#__len__","title":"__len__","text":"
def __len__()\n

Number of images.

"},{"location":"utils/data_generator/#__getitem__","title":"__getitem__","text":"
def __getitem__(item)\n
"},{"location":"utils/general_utils/","title":"General utils","text":""},{"location":"utils/general_utils/#get_files_to_remove","title":"get_files_to_remove","text":"
def get_files_to_remove(duplicates)\n

Get a list of files to remove.

"},{"location":"utils/general_utils/#args","title":"Args","text":"
  • duplicates: A dictionary with file name as key and a list of duplicate file names as value.
"},{"location":"utils/general_utils/#returns","title":"Returns","text":""},{"location":"utils/general_utils/#save_json","title":"save_json","text":"
def save_json(results, filename, float_scores)\n

Save results to a file with the given filename.

"},{"location":"utils/general_utils/#args_1","title":"Args","text":"
  • results: Dictionary of results to be saved.

  • filename: Name of the file to be saved.

  • float_scores: boolean to indicate if scores are floats.

"},{"location":"utils/general_utils/#parallelise","title":"parallelise","text":"
def parallelise(function, data, verbose, num_workers)\n
"},{"location":"utils/general_utils/#generate_files","title":"generate_files","text":"
def generate_files(image_dir, recursive)\n
"},{"location":"utils/general_utils/#generate_relative_names","title":"generate_relative_names","text":"
def generate_relative_names(image_dir, files)\n
"},{"location":"utils/image_utils/","title":"Image utils","text":""},{"location":"utils/image_utils/#check_image_array_hash","title":"check_image_array_hash","text":"
def check_image_array_hash(image_arr)\n

Checks the sanity of the input image numpy array for hashing functions.

"},{"location":"utils/image_utils/#args","title":"Args","text":"
  • image_arr: Image array.
"},{"location":"utils/image_utils/#expand_image_array_cnn","title":"expand_image_array_cnn","text":"
def expand_image_array_cnn(image_arr)\n

Checks the sanity of the input image numpy array for cnn and, if a 2-dimensional grayscale array is provided, converts it to rgb by repeating the array thrice along the 3rd dimension.

"},{"location":"utils/image_utils/#args_1","title":"Args","text":"
  • image_arr: Image array.
"},{"location":"utils/image_utils/#returns","title":"Returns","text":""},{"location":"utils/image_utils/#preprocess_image","title":"preprocess_image","text":"
def preprocess_image(image, target_size, grayscale)\n

Takes as input an image as a numpy array or in Pillow format. Returns an array version of the image, optionally resized and grayscaled.

"},{"location":"utils/image_utils/#args_2","title":"Args","text":"
  • image: numpy array or a pillow image.

  • target_size: Size to resize the input image to.

  • grayscale: A boolean indicating whether to grayscale the image.

"},{"location":"utils/image_utils/#returns_1","title":"Returns","text":""},{"location":"utils/image_utils/#load_image","title":"load_image","text":"
def load_image(image_file, target_size, grayscale, img_formats)\n

Load an image given its path. Returns an array version of the image, optionally resized and grayscaled. Only allows images of the types described by the img_formats argument.

"},{"location":"utils/image_utils/#args_3","title":"Args","text":"
  • image_file: Path to the image file.

  • target_size: Size to resize the input image to.

  • grayscale: A boolean indicating whether to grayscale the image.

  • img_formats: List of allowed image formats that can be loaded.

"},{"location":"utils/logger/","title":"Logger","text":""},{"location":"utils/logger/#return_logger","title":"return_logger","text":"
def return_logger(name)\n
"},{"location":"utils/models/","title":"Custom Models","text":""},{"location":"utils/models/#class-custommodel","title":"class CustomModel","text":"

A named tuple that can be used to initialize a custom PyTorch model.

"},{"location":"utils/models/#args","title":"Args","text":"
  • name: The name of the custom model. Default is 'default_model'.

  • model: The PyTorch model object, which is a subclass of torch.nn.Module, implements the forward method and outputs a tensor of shape (batch_size x features). Alternatively, a __call__ method is also accepted. Default is None.

  • transform: A function that transforms a PIL.Image object into a PyTorch tensor that will be applied to each image before being fed to the model. Should correspond to the preprocessing logic of the supplied model. Default is None.

"},{"location":"utils/models/#class-mobilenetv3","title":"class MobilenetV3","text":""},{"location":"utils/models/#__init__","title":"__init__","text":"
def __init__()\n

Initializes a MobileNetV3 model, cuts it at the global average pooling layer and returns the output features.
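
A minimal sketch of using the model as a standalone feature extractor (the 224 x 224 input size is an assumption for illustration):

import torch\nfrom imagededup.utils.models import MobilenetV3\n\nmodel = MobilenetV3()\nmodel.eval()  # inference mode\nx = torch.randn(2, 3, 224, 224)  # batch of 2 dummy images (assumed input size)\nfeatures = model(x)  # expected shape: (2, 576), matching the (576,) per-image encodings\n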

"},{"location":"utils/models/#forward","title":"forward","text":"
def forward(x)\n
"},{"location":"utils/models/#class-vit","title":"class ViT","text":""},{"location":"utils/models/#__init___1","title":"__init__","text":"
def __init__()\n

Initializes a ViT model, takes the mean of the final encoder layer outputs and returns those as features for a given image.

"},{"location":"utils/models/#forward_1","title":"forward","text":"
def forward(x)\n
"},{"location":"utils/models/#class-efficientnet","title":"class EfficientNet","text":""},{"location":"utils/models/#__init___2","title":"__init__","text":"
def __init__()\n

Initializes an EfficientNet model, cuts it at the global average pooling layer and returns the output features.

"},{"location":"utils/models/#forward_2","title":"forward","text":"
def forward(x)\n
"},{"location":"utils/plotter/","title":"Plot duplicates","text":""},{"location":"utils/plotter/#plot_duplicates","title":"plot_duplicates","text":"
def plot_duplicates(image_dir, duplicate_map, filename, outfile)\n

Given the filename of an image, plot its duplicates along with the original image, using the duplicate map obtained via the find_duplicates method.

"},{"location":"utils/plotter/#args","title":"Args","text":"
  • image_dir: Image directory where all files in duplicate_map are present.

  • duplicate_map: Mapping of filenames to found duplicates (could be with or without scores).

  • filename: Name of the file for which duplicates are to be plotted, must be a key in the duplicate_map.

  • outfile: Optional, name of the file to save the plot. Default is None.

"},{"location":"utils/plotter/#example-usage","title":"Example usage","text":"
from imagededup.utils import plot_duplicates\nplot_duplicates(image_dir='path/to/image/directory',\nduplicate_map=duplicate_map,\nfilename='image.jpg')\n
"}]} \ No newline at end of file diff --git a/sitemap.xml b/sitemap.xml index 89f5ebff..0f8724ef 100644 --- a/sitemap.xml +++ b/sitemap.xml @@ -1,128 +1,3 @@ - - None - 2023-04-28 - daily - - - None - 2023-04-28 - daily - - - None - 2023-04-28 - daily - - - None - 2023-04-28 - daily - - - None - 2023-04-28 - daily - - - None - 2023-04-28 - daily - - - None - 2023-04-28 - daily - - - None - 2023-04-28 - daily - - - None - 2023-04-28 - daily - - - None - 2023-04-28 - daily - - - None - 2023-04-28 - daily - - - None - 2023-04-28 - daily - - - None - 2023-04-28 - daily - - - None - 2023-04-28 - daily - - - None - 2023-04-28 - daily - - - None - 2023-04-28 - daily - - - None - 2023-04-28 - daily - - - None - 2023-04-28 - daily - - - None - 2023-04-28 - daily - - - None - 2023-04-28 - daily - - - None - 2023-04-28 - daily - - - None - 2023-04-28 - daily - - - None - 2023-04-28 - daily - - - None - 2023-04-28 - daily - - - None - 2023-04-28 - daily - \ No newline at end of file diff --git a/sitemap.xml.gz b/sitemap.xml.gz index 84c5452f5adef7c8c58910fbce3e788607cddf6b..ebba662d81ad09a7eb318566b916d49ae032ca88 100644 GIT binary patch literal 127 zcmV-_0D%7=iwFphWPxP@|8r?{Wo=<_E_iKh04<9_3V)_WXo8&M?ytk3HC}0~zlG)Vu0AI-n_Kb9B-N zAf-Z+(ju**Z*TdC;Sn6}mR|q8+;1)`-<^(Ma&XF;g6vXCVrhh}m1zocyPlJrlssb3 zK|AT=Fl<~wTJQT4z&MVnQ(kse?@~K7;6xiP4%QH8dL%rGLrn{oZZ4*+;1_EogQ?p- zRL*y{<{8cQ2|XlP9!Az|GbY-~H1a;k2d^uoKMXd>BEpYE9C5@EM;vj)5l0+x#Qz(g QkMQN-UwHPEsG