from django.core.management.base import BaseCommand
from django.db import transaction
from django.db.models import Exists, OuterRef
from fuzzywuzzy import fuzz
from taggit.models import Tag, TaggedItem

# Minimum fuzz.ratio() score at which two tag names are considered the same tag.
SIMILARITY_THRESHOLD = 75


class Command(BaseCommand):
    """Merge tags whose names are near-duplicates of one another.

    Tags are visited from newest to oldest id. When two names score at least
    SIMILARITY_THRESHOLD, the tag that comes later in that order (the lower id)
    is merged into the earlier one (the higher id): its tagged items are
    re-pointed and the tag itself is deleted.
    """

    help = "Merge similar tags"

    def handle(self, *args, **kwargs):
        """Run merge passes until a full pass finds nothing left to merge."""
        # Each surviving tag absorbs at most one other tag per pass, so a
        # single pass can leave similar pairs behind; reload and go again.
        while self._merge_pass(self._load_tags()):
            pass

    @staticmethod
    def _load_tags():
        """Return all tags as (id, name) pairs, newest id first."""
        # One query per pass instead of one Tag lookup per compared pair.
        return list(Tag.objects.order_by("-id").values_list("id", "name"))

    def _merge_pass(self, tags):
        """Scan every ordered pair once, merging the first match per primary.

        Args:
            tags: list of (id, name) pairs in the order they should be visited.

        Returns:
            True if at least one tag was merged during this pass.
        """
        merged_any = False
        deleted_ids = set()

        for index, (primary_id, primary_name) in enumerate(tags):
            if primary_id in deleted_ids:
                continue

            for secondary_id, secondary_name in tags[index + 1:]:
                if secondary_id in deleted_ids:
                    continue
                if fuzz.ratio(primary_name, secondary_name) < SIMILARITY_THRESHOLD:
                    continue

                self._merge_tag_into(primary_id, secondary_id)
                deleted_ids.add(secondary_id)
                self.stdout.write(f"Merged '{secondary_name}' into '{primary_name}'")
                merged_any = True
                break  # at most one merge per primary per pass

        return merged_any

    @staticmethod
    def _merge_tag_into(primary_id, secondary_id):
        """Move every tagged item from the secondary tag to the primary, then drop it.

        Args:
            primary_id: id of the tag that survives.
            secondary_id: id of the tag that is merged away and deleted.
        """
        secondary_items = TaggedItem.objects.filter(tag_id=secondary_id)

        # An object is already covered only if the primary tag is attached to
        # the SAME object, i.e. same content type and same object id. Matching
        # on object_id alone would hit unrelated objects of other content types
        # that happen to share the numeric id, and they would lose the tag.
        already_tagged = TaggedItem.objects.filter(
            tag_id=primary_id,
            content_type_id=OuterRef("content_type_id"),
            object_id=OuterRef("object_id"),
        )

        # All three steps succeed or fail together so an interrupted run
        # cannot leave a half-merged tag behind.
        with transaction.atomic():
            secondary_items.filter(Exists(already_tagged)).delete()
            secondary_items.update(tag_id=primary_id)
            Tag.objects.filter(id=secondary_id).delete()
">=3.2.0,<3.3.0" +[[package]] +name = "fuzzywuzzy" +version = "0.18.0" +description = "Fuzzy string matching in python" +optional = false +python-versions = "*" +files = [ + {file = "fuzzywuzzy-0.18.0-py2.py3-none-any.whl", hash = "sha256:928244b28db720d1e0ee7587acf660ea49d7e4c632569cad4f1cd7e68a5f0993"}, + {file = "fuzzywuzzy-0.18.0.tar.gz", hash = "sha256:45016e92264780e58972dca1b3d939ac864b78437422beecebb3095f8efd00e8"}, +] + +[package.extras] +speedup = ["python-levenshtein (>=0.12)"] + [[package]] name = "gevent" version = "23.9.1" @@ -3096,6 +3110,8 @@ files = [ {file = "psycopg2-2.9.9-cp310-cp310-win_amd64.whl", hash = "sha256:426f9f29bde126913a20a96ff8ce7d73fd8a216cfb323b1f04da402d452853c3"}, {file = "psycopg2-2.9.9-cp311-cp311-win32.whl", hash = "sha256:ade01303ccf7ae12c356a5e10911c9e1c51136003a9a1d92f7aa9d010fb98372"}, {file = "psycopg2-2.9.9-cp311-cp311-win_amd64.whl", hash = "sha256:121081ea2e76729acfb0673ff33755e8703d45e926e416cb59bae3a86c6a4981"}, + {file = "psycopg2-2.9.9-cp312-cp312-win32.whl", hash = "sha256:d735786acc7dd25815e89cc4ad529a43af779db2e25aa7c626de864127e5a024"}, + {file = "psycopg2-2.9.9-cp312-cp312-win_amd64.whl", hash = "sha256:a7653d00b732afb6fc597e29c50ad28087dcb4fbfb28e86092277a559ae4e693"}, {file = "psycopg2-2.9.9-cp37-cp37m-win32.whl", hash = "sha256:5e0d98cade4f0e0304d7d6f25bbfbc5bd186e07b38eac65379309c4ca3193efa"}, {file = "psycopg2-2.9.9-cp37-cp37m-win_amd64.whl", hash = "sha256:7e2dacf8b009a1c1e843b5213a87f7c544b2b042476ed7755be813eaf4e8347a"}, {file = "psycopg2-2.9.9-cp38-cp38-win32.whl", hash = "sha256:ff432630e510709564c01dafdbe996cb552e0b9f3f065eb89bdce5bd31fabf4c"}, @@ -5022,4 +5038,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "eba2d4699a2487fddd0734ba39fe2e2c52958212407ddf1ac4e0c9dea837f528" +content-hash = "c156bc4650e5c519f35bbe1e39525a27cbcd72932a0e212f3eaeb066862de668" diff --git a/pyproject.toml b/pyproject.toml index 
4a77b50d1..8f5721331 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -90,6 +90,7 @@ google-analytics-data = "0.18.7" pyparsing = "3.1.2" django-silk = "^5.1.0" requests = "^2.32.3" +fuzzywuzzy = "^0.18.0" [tool.poetry.group.dev.dependencies] ipdb = "^0.13.11"