Merge pull request #6357 from smithellis/1952-phone-number-forum-spam

Improve spam filtering
This commit is contained in:
Tasos Katsoulas 2024-11-20 11:22:21 +02:00 коммит произвёл GitHub
Родитель 03b735d4f1 5f7737d8c2
Коммит 26d945bf60
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: B5690EEEBB952194
4 изменённых файлов: 44 добавлений и 16 удалений

Просмотреть файл

@ -10,6 +10,7 @@ from kitsune.questions.events import QuestionReplyEvent
from kitsune.questions.models import AAQConfig, Answer, Question from kitsune.questions.models import AAQConfig, Answer, Question
from kitsune.questions.utils import remove_pii from kitsune.questions.utils import remove_pii
from kitsune.sumo.forms import KitsuneBaseForumForm from kitsune.sumo.forms import KitsuneBaseForumForm
from kitsune.sumo.utils import check_for_spam_content
from kitsune.upload.models import ImageAttachment from kitsune.upload.models import ImageAttachment
# labels and help text # labels and help text
@ -185,6 +186,24 @@ class NewQuestionForm(EditQuestionForm):
topics = Topic.active.filter(products=product, in_aaq=True) topics = Topic.active.filter(products=product, in_aaq=True)
self.fields["category"].queryset = topics self.fields["category"].queryset = topics
def clean(self, *args, **kwargs):
    """Run the inherited validation, then flag spam-like question content.

    The parent ``clean()`` is always invoked first, so the validation it
    performs is not bypassed when ``content`` is present. (This form
    presumably derives from a Django ``ModelForm`` -- ``save()`` uses
    ``self.instance`` -- in which case the parent ``clean()`` is also what
    enables uniqueness validation; confirm against the class hierarchy.)

    The ``content`` field is then handed to ``check_for_spam_content``;
    when that reports a hit, ``is_spam`` is set to ``True`` in the cleaned
    data for the caller to inspect.

    Returns:
        The cleaned data dict, with ``is_spam`` added when the content
        looks like spam.
    """
    cleaned_data = super().clean(*args, **kwargs)
    # Django allows clean() to return None, meaning "use self.cleaned_data".
    if cleaned_data is None:
        cleaned_data = self.cleaned_data

    content = cleaned_data.get("content")
    # Empty or missing content has nothing to scan; leave the data untouched.
    if content and check_for_spam_content(content):
        cleaned_data["is_spam"] = True

    return cleaned_data
def save(self, user, locale, product, *args, **kwargs): def save(self, user, locale, product, *args, **kwargs):
self.instance.creator = user self.instance.creator = user
self.instance.locale = locale self.instance.locale = locale
@ -228,15 +247,6 @@ class AnswerForm(KitsuneBaseForumForm):
model = Answer model = Answer
fields = ("content",) fields = ("content",)
def clean(self, *args, **kwargs):
    """Clean the form as usual, but never keep a spam flag on the asker's own reply."""
    cleaned = super(AnswerForm, self).clean(*args, **kwargs)

    replied_by_owner = self.user and self.question and self.user == self.question.creator
    if not replied_by_owner:
        return cleaned

    # The reply comes from the question's creator: drop any spam flag set upstream.
    cleaned.pop("is_spam", None)
    return cleaned
class WatchQuestionForm(forms.Form): class WatchQuestionForm(forms.Form):
"""Form to subscribe to question updates.""" """Form to subscribe to question updates."""

Просмотреть файл

@ -639,12 +639,12 @@ def aaq(request, product_slug=None, step=1, is_loginless=False):
product=product, product=product,
) )
if form.cleaned_data.get("is_spam"):
_add_to_moderation_queue(request, question)
# Submitting the question counts as a vote # Submitting the question counts as a vote
question_vote(request, question.id) question_vote(request, question.id)
if form.cleaned_data.get("is_spam"):
_add_to_moderation_queue(request, question)
my_questions_url = reverse("users.questions", args=[request.user.username]) my_questions_url = reverse("users.questions", args=[request.user.username])
messages.add_message( messages.add_message(
request, request,

Просмотреть файл

@ -1136,9 +1136,26 @@ ALLOW_LINKS_FROM = [
] ]
# Regexes # Regexes
TOLL_FREE_REGEX = re.compile(r"^.*8(00|33|44|55|66|77|88)[2-9]\d{6,}$")
REGEX_TIMEOUT = config("REGEX_TIMEOUT", default=5, cast=int) REGEX_TIMEOUT = config("REGEX_TIMEOUT", default=5, cast=int)
TOLL_FREE_REGEX = re.compile(r"^.*8(00|33|44|55|66|77|88)[2-9]\d{6,}$")
NANP_REGEX = re.compile(r"[0-9]{3}-?[a-zA-Z2-9][a-zA-Z0-9]{2}-?[a-zA-Z0-9]{4}") NANP_REGEX = re.compile(r"[0-9]{3}-?[a-zA-Z2-9][a-zA-Z0-9]{2}-?[a-zA-Z0-9]{4}")
# Loose "looks like a phone number" pattern.
# Shape: an optional international prefix (+, 00 or 011), then four digit groups
# of 1-4, 1-4, 1-4 and 1-9 digits. Each group may be preceded by a single space,
# dot or dash, and the first group may be wrapped in parentheses. The lookarounds
# stop a match from starting or ending in the middle of a word.
# NOTE(review): the four groups need only one digit each, so any standalone run
# of four or more digits (e.g. "2024") and dotted strings such as "115.0.2"
# also match -- confirm this breadth is intended before relying on it.
ANY_PHONE_NUMBER = re.compile(
r"""
(?<!\w) # Assert position is not preceded by a word character (prevents partial matches)
(?:\+|00|011)? # Match optional country code prefix (+, 00, or 011)
[\s.-]?\(? # Optional separator (space, dot, dash) and optional opening parenthesis
\d{1,4} # Match 1-4 digits (area code or first part of the number)
\)? # Optional closing parenthesis
[\s.-]? # Optional separator (space, dot, dash)
\d{1,4} # Match 1-4 digits (first part of the phone number)
[\s.-]? # Optional separator (space, dot, dash)
\d{1,4} # Match 1-4 digits (second part of the phone number)
[\s.-]? # Optional separator (space, dot, dash)
\d{1,9} # Match 1-9 digits (remaining part of the phone number)
(?!\w) # Assert position is not followed by a word character (prevents partial matches)
""",
re.VERBOSE, # Allows for the use of comments and whitespace in the pattern for readability
)
if ES_ENABLE_CONSOLE_LOGGING and DEV: if ES_ENABLE_CONSOLE_LOGGING and DEV:
es_trace_logger = logging.getLogger("elasticsearch.trace") es_trace_logger = logging.getLogger("elasticsearch.trace")

Просмотреть файл

@ -321,17 +321,18 @@ def check_for_spam_content(data):
- Toll free numbers - Toll free numbers
- Vanity toll free numbers - Vanity toll free numbers
- Links in the text. - Links in the text.
- Any phone number-ish string of digits
""" """
# keep only the digits in text # keep only digits
digits = "".join(filter(type(data).isdigit, data)) digits = "".join(filter(type(data).isdigit, data))
is_toll_free = settings.TOLL_FREE_REGEX.match(digits) is_toll_free = settings.TOLL_FREE_REGEX.match(digits)
is_nanp_number = any(settings.NANP_REGEX.findall(data)) is_nanp_number = any(settings.NANP_REGEX.findall(data))
is_phone_number = any(settings.ANY_PHONE_NUMBER.findall(data))
has_links = has_blocked_link(data) has_links = has_blocked_link(data)
return is_toll_free or is_nanp_number or has_links return is_toll_free or is_nanp_number or is_phone_number or has_links
@lru_cache(maxsize=settings.WEBPACK_LRU_CACHE) @lru_cache(maxsize=settings.WEBPACK_LRU_CACHE)