Skip to content

Commit

Permalink
Merge pull request #593 from NASA-IMPACT/592-script-to-remove-duplica…
Browse files Browse the repository at this point in the history
…ted-urls-from-solar-system-exploration

python script added
  • Loading branch information
bishwaspraveen authored Feb 19, 2024
2 parents b977a1a + 6c1b5eb commit 82570cb
Showing 1 changed file with 35 additions and 0 deletions.
35 changes: 35 additions & 0 deletions scripts/delete_duplicate_urls_on_webapp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from django.db.models import Count

from sde_collections.models.candidate_url import CandidateURL
from sde_collections.models.collection import Collection


def remove_duplicate_urls(collection_name):
    """Remove duplicate CandidateURL rows for the named collection.

    For every URL that appears more than once within the collection, the row
    with the lowest ``id`` is kept and every other row with that URL is
    deleted. Progress is reported with ``print``; nothing is returned.

    Args:
        collection_name: The name of the collection for which to remove
            duplicate URLs. If no collection has this name, a message is
            printed and nothing is deleted.
    """
    try:
        collection = Collection.objects.get(name=collection_name)
    except Collection.DoesNotExist:
        print(f"Collection with name '{collection_name}' does not exist.")
        return

    urls_in_collection = CandidateURL.objects.filter(collection=collection)

    # The empty order_by() clears any default ordering the model may declare,
    # so the query groups on "url" alone; stray ordering columns in the
    # GROUP BY would split the groups and hide real duplicates.
    duplicate_urls = (
        urls_in_collection.values("url")
        .annotate(url_count=Count("id"))
        .filter(url_count__gt=1)
        .order_by()
    )

    for entry in duplicate_urls:
        url = entry["url"]
        duplicate_entries = urls_in_collection.filter(url=url).order_by("id")

        # Fetch only the id of the row to keep instead of a full model instance.
        keep_id = duplicate_entries.values_list("id", flat=True).first()
        if keep_id is None:
            # The rows disappeared since the grouping query ran; nothing to do.
            continue

        # delete() returns (total, {model_label: count}). Use the per-model
        # figure so rows removed from other tables by cascades are not counted,
        # and so the message reflects what was actually deleted.
        _, deleted_per_model = duplicate_entries.exclude(id=keep_id).delete()
        count_deleted = deleted_per_model.get(CandidateURL._meta.label, 0)
        print(f"Deleted {count_deleted} duplicate entries for URL '{url}'.")

    print("Completed deleting duplicated URLs...")

0 comments on commit 82570cb

Please sign in to comment.