Removing Duplicate Rows

Duplicate rows in a table can be a problem, especially where a field needs to be unique. This can happen when data has become corrupted in some way.

On a table with many rows this can be challenging to resolve, so it's important to minimise what's loaded into memory. The approaches below operate only on the IDs of duplicated rows, then process them individually.
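
The examples below group on the id column, but the same pattern finds duplicates on any field that should be unique. A minimal sketch, assuming a hypothetical name field on MyModel:

from django.db.models import Count
from myapp.models import MyModel

# Find values of the (hypothetical) "name" field that appear on more than one row
duplicate_names = (
    MyModel.objects.values("name")
    .annotate(name_count=Count("id"))
    .filter(name_count__gt=1)
)

for duplicate in duplicate_names:
    print(f"{duplicate['name']} appears {duplicate['name_count']} times")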

Delete rows with duplicate IDs

import time
from django.db.models import Count
from myapp.models import MyModel
 
# Get a list of IDs of duplicated rows
duplicates = (
    MyModel.objects.values("id")
    .annotate(id_count=Count("id"))  # count how many rows share each ID
    .filter(id_count__gt=1)
)
 
# Iterate over the rows, deleting all but the first
total = duplicates.count()
for count, duplicate in enumerate(duplicates, start=1):
    print(f"Processing {count} of {total}")
    dupes = MyModel.objects.filter(id=duplicate["id"])[1:]
    for dupe in dupes:
        print(f"Deleting {dupe.id}")
        # dupe.delete()  # Commented for safety
    time.sleep(1)  # Optional throttle to avoid overloading the database
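
One caveat with this approach: Model.delete() issues a DELETE keyed on the primary key, so where the IDs themselves are duplicated it would remove every row sharing that ID, including the one you want to keep. If the database is PostgreSQL, a raw query against the hidden ctid column can target individual physical rows instead. A minimal sketch, assuming PostgreSQL and the default table name myapp_mymodel:

from django.db import connection

# Delete all but one physical row for each duplicated ID.
# ctid identifies a row's physical location, so it is unique
# even when the id column is not (PostgreSQL only).
with connection.cursor() as cursor:
    cursor.execute(
        """
        DELETE FROM myapp_mymodel a
        USING myapp_mymodel b
        WHERE a.id = b.id
          AND a.ctid > b.ctid
        """
    )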

Save data to CSV before deleting

import csv
import time
from django.db.models import Count
from myapp.models import MyModel
 
# Get a list of IDs of duplicated rows
duplicates = (
    MyModel.objects.values("id")
    .annotate(id_count=Count("id"))  # count how many rows share each ID
    .filter(id_count__gt=1)
)
 
with open("/path/to/my_model_duplicate_deletions.csv", "w", newline="") as csvfile:
    c = csv.writer(csvfile)
    c.writerow(
        [
            "field1",
            "field2",
        ]
    )
    # Iterate over the rows, deleting all but the first
    total = duplicates.count()
    for count, duplicate in enumerate(duplicates, start=1):
        print(f"Processing {count} of {total}")
        dupes = MyModel.objects.filter(id=duplicate["id"])[1:]
        for dupe in dupes:
            print(f"Deleting {dupe.id}")
            c.writerow(
                [
                    dupe.field1,  # attribute access; dupes are model instances
                    dupe.field2,
                ]
            )
            # dupe.delete()  # Commented for safety
        time.sleep(1)  # Optional throttle to avoid overloading the database
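
Hand-picking columns risks losing data you later wish you'd kept. As an alternative, Django's built-in serializers can dump the complete rows as JSON before deletion. A minimal sketch along the same lines, with the file path a placeholder as above:

from django.core import serializers
from django.db.models import Count
from myapp.models import MyModel

# Get a list of IDs of duplicated rows, as above
duplicates = (
    MyModel.objects.values("id")
    .annotate(id_count=Count("id"))
    .filter(id_count__gt=1)
)

# Collect every row slated for deletion; only the duplicates are
# loaded into memory, not the whole table
doomed = []
for duplicate in duplicates:
    doomed.extend(MyModel.objects.filter(id=duplicate["id"])[1:])

# serialize() records every field, so rows can be restored later
with open("/path/to/my_model_duplicate_deletions.json", "w") as jsonfile:
    serializers.serialize("json", doomed, stream=jsonfile)

Rows dumped this way can later be reloaded with Django's deserializers or the loaddata management command.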