final
This commit is contained in:
Binary file not shown.
@@ -35,6 +35,20 @@ IGNORE_SELECTORS = [
|
|||||||
|
|
||||||
CHUNK_SIZE = 9000 # stay under 10k limit
|
CHUNK_SIZE = 9000 # stay under 10k limit
|
||||||
|
|
||||||
|
# LanguageTool rule IDs to silently ignore
|
||||||
|
IGNORED_RULES = {
|
||||||
|
"LEERZEICHEN_VOR_SATZZEICHEN",
|
||||||
|
"WHITESPACE_RULE",
|
||||||
|
"WHITESPACE_BEFORE_PUNCTUATION",
|
||||||
|
"LEERZEICHEN_VOR_DOPPELPUNKT",
|
||||||
|
"LEERZEICHEN_VOR_SEMIKOLON",
|
||||||
|
"LEERZEICHEN_VOR_AUSRUFEZEICHEN",
|
||||||
|
"LEERZEICHEN_VOR_FRAGEZEICHEN",
|
||||||
|
}
|
||||||
|
|
||||||
|
# Regex: space(s) directly before punctuation — catch any remaining cases
|
||||||
|
SPACE_BEFORE_PUNCT_RE = re.compile(r"\s+[,.:;!?\"\u201C\u201D\u201E\u201F]")
|
||||||
|
|
||||||
|
|
||||||
def normalize_url(url: str) -> str:
|
def normalize_url(url: str) -> str:
|
||||||
"""Remove fragment and trailing slash for dedup."""
|
"""Remove fragment and trailing slash for dedup."""
|
||||||
@@ -289,6 +303,15 @@ def check_text_with_languagetool(
|
|||||||
if resp.status_code == 200:
|
if resp.status_code == 200:
|
||||||
result = resp.json()
|
result = resp.json()
|
||||||
for match in result.get("matches", []):
|
for match in result.get("matches", []):
|
||||||
|
rule_id = match.get("rule", {}).get("id", "")
|
||||||
|
if rule_id in IGNORED_RULES:
|
||||||
|
continue
|
||||||
|
# Extra filter: skip any match that is just whitespace before punctuation
|
||||||
|
m_off = match.get("offset", 0)
|
||||||
|
m_len = match.get("length", 0)
|
||||||
|
matched_text = chunk_text[m_off:m_off + m_len]
|
||||||
|
if SPACE_BEFORE_PUNCT_RE.fullmatch(matched_text):
|
||||||
|
continue
|
||||||
match["offset"] += offset
|
match["offset"] += offset
|
||||||
all_matches.append(match)
|
all_matches.append(match)
|
||||||
elif resp.status_code in (401, 403):
|
elif resp.status_code in (401, 403):
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user