final
This commit is contained in:
Binary file not shown.
@@ -35,6 +35,20 @@ IGNORE_SELECTORS = [
|
||||
|
||||
CHUNK_SIZE = 9000 # stay under 10k limit
|
||||
|
||||
# LanguageTool rule IDs to silently ignore
|
||||
IGNORED_RULES = {
|
||||
"LEERZEICHEN_VOR_SATZZEICHEN",
|
||||
"WHITESPACE_RULE",
|
||||
"WHITESPACE_BEFORE_PUNCTUATION",
|
||||
"LEERZEICHEN_VOR_DOPPELPUNKT",
|
||||
"LEERZEICHEN_VOR_SEMIKOLON",
|
||||
"LEERZEICHEN_VOR_AUSRUFEZEICHEN",
|
||||
"LEERZEICHEN_VOR_FRAGEZEICHEN",
|
||||
}
|
||||
|
||||
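# Regex: one or more whitespace characters immediately followed by a single punctuation mark or double quote (straight or curly) — catches cases not covered by IGNORED_RULES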
|
||||
SPACE_BEFORE_PUNCT_RE = re.compile(r"\s+[,.:;!?\"\u201C\u201D\u201E\u201F]")
|
||||
|
||||
|
||||
def normalize_url(url: str) -> str:
|
||||
"""Remove fragment and trailing slash for dedup."""
|
||||
@@ -289,6 +303,15 @@ def check_text_with_languagetool(
|
||||
if resp.status_code == 200:
|
||||
result = resp.json()
|
||||
for match in result.get("matches", []):
|
||||
rule_id = match.get("rule", {}).get("id", "")
|
||||
if rule_id in IGNORED_RULES:
|
||||
continue
|
||||
# Extra filter: skip any match that is just whitespace before punctuation
|
||||
m_off = match.get("offset", 0)
|
||||
m_len = match.get("length", 0)
|
||||
matched_text = chunk_text[m_off:m_off + m_len]
|
||||
if SPACE_BEFORE_PUNCT_RE.fullmatch(matched_text):
|
||||
continue
|
||||
match["offset"] += offset
|
||||
all_matches.append(match)
|
||||
elif resp.status_code in (401, 403):
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user