md.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559
  1. from functools import lru_cache
  2. from typing import List, Optional
  3. from .constant import COMMON_SAFE_ASCII_CHARACTERS, UNICODE_SECONDARY_RANGE_KEYWORD
  4. from .utils import (
  5. is_accentuated,
  6. is_ascii,
  7. is_case_variable,
  8. is_cjk,
  9. is_emoticon,
  10. is_hangul,
  11. is_hiragana,
  12. is_katakana,
  13. is_latin,
  14. is_punctuation,
  15. is_separator,
  16. is_symbol,
  17. is_thai,
  18. remove_accent,
  19. unicode_range,
  20. )
  21. class MessDetectorPlugin:
  22. """
  23. Base abstract class used for mess detection plugins.
  24. All detectors MUST extend and implement given methods.
  25. """
  26. def eligible(self, character: str) -> bool:
  27. """
  28. Determine if given character should be fed in.
  29. """
  30. raise NotImplementedError # pragma: nocover
  31. def feed(self, character: str) -> None:
  32. """
  33. The main routine to be executed upon character.
  34. Insert the logic in witch the text would be considered chaotic.
  35. """
  36. raise NotImplementedError # pragma: nocover
  37. def reset(self) -> None: # pragma: no cover
  38. """
  39. Permit to reset the plugin to the initial state.
  40. """
  41. raise NotImplementedError
  42. @property
  43. def ratio(self) -> float:
  44. """
  45. Compute the chaos ratio based on what your feed() has seen.
  46. Must NOT be lower than 0.; No restriction gt 0.
  47. """
  48. raise NotImplementedError # pragma: nocover
  49. class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
  50. def __init__(self) -> None:
  51. self._punctuation_count = 0 # type: int
  52. self._symbol_count = 0 # type: int
  53. self._character_count = 0 # type: int
  54. self._last_printable_char = None # type: Optional[str]
  55. self._frenzy_symbol_in_word = False # type: bool
  56. def eligible(self, character: str) -> bool:
  57. return character.isprintable()
  58. def feed(self, character: str) -> None:
  59. self._character_count += 1
  60. if (
  61. character != self._last_printable_char
  62. and character not in COMMON_SAFE_ASCII_CHARACTERS
  63. ):
  64. if is_punctuation(character):
  65. self._punctuation_count += 1
  66. elif (
  67. character.isdigit() is False
  68. and is_symbol(character)
  69. and is_emoticon(character) is False
  70. ):
  71. self._symbol_count += 2
  72. self._last_printable_char = character
  73. def reset(self) -> None: # pragma: no cover
  74. self._punctuation_count = 0
  75. self._character_count = 0
  76. self._symbol_count = 0
  77. @property
  78. def ratio(self) -> float:
  79. if self._character_count == 0:
  80. return 0.0
  81. ratio_of_punctuation = (
  82. self._punctuation_count + self._symbol_count
  83. ) / self._character_count # type: float
  84. return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
  85. class TooManyAccentuatedPlugin(MessDetectorPlugin):
  86. def __init__(self) -> None:
  87. self._character_count = 0 # type: int
  88. self._accentuated_count = 0 # type: int
  89. def eligible(self, character: str) -> bool:
  90. return character.isalpha()
  91. def feed(self, character: str) -> None:
  92. self._character_count += 1
  93. if is_accentuated(character):
  94. self._accentuated_count += 1
  95. def reset(self) -> None: # pragma: no cover
  96. self._character_count = 0
  97. self._accentuated_count = 0
  98. @property
  99. def ratio(self) -> float:
  100. if self._character_count == 0:
  101. return 0.0
  102. ratio_of_accentuation = (
  103. self._accentuated_count / self._character_count
  104. ) # type: float
  105. return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0
  106. class UnprintablePlugin(MessDetectorPlugin):
  107. def __init__(self) -> None:
  108. self._unprintable_count = 0 # type: int
  109. self._character_count = 0 # type: int
  110. def eligible(self, character: str) -> bool:
  111. return True
  112. def feed(self, character: str) -> None:
  113. if (
  114. character.isspace() is False # includes \n \t \r \v
  115. and character.isprintable() is False
  116. and character != "\x1A" # Why? Its the ASCII substitute character.
  117. ):
  118. self._unprintable_count += 1
  119. self._character_count += 1
  120. def reset(self) -> None: # pragma: no cover
  121. self._unprintable_count = 0
  122. @property
  123. def ratio(self) -> float:
  124. if self._character_count == 0:
  125. return 0.0
  126. return (self._unprintable_count * 8) / self._character_count
  127. class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
  128. def __init__(self) -> None:
  129. self._successive_count = 0 # type: int
  130. self._character_count = 0 # type: int
  131. self._last_latin_character = None # type: Optional[str]
  132. def eligible(self, character: str) -> bool:
  133. return character.isalpha() and is_latin(character)
  134. def feed(self, character: str) -> None:
  135. self._character_count += 1
  136. if (
  137. self._last_latin_character is not None
  138. and is_accentuated(character)
  139. and is_accentuated(self._last_latin_character)
  140. ):
  141. if character.isupper() and self._last_latin_character.isupper():
  142. self._successive_count += 1
  143. # Worse if its the same char duplicated with different accent.
  144. if remove_accent(character) == remove_accent(self._last_latin_character):
  145. self._successive_count += 1
  146. self._last_latin_character = character
  147. def reset(self) -> None: # pragma: no cover
  148. self._successive_count = 0
  149. self._character_count = 0
  150. self._last_latin_character = None
  151. @property
  152. def ratio(self) -> float:
  153. if self._character_count == 0:
  154. return 0.0
  155. return (self._successive_count * 2) / self._character_count
  156. class SuspiciousRange(MessDetectorPlugin):
  157. def __init__(self) -> None:
  158. self._suspicious_successive_range_count = 0 # type: int
  159. self._character_count = 0 # type: int
  160. self._last_printable_seen = None # type: Optional[str]
  161. def eligible(self, character: str) -> bool:
  162. return character.isprintable()
  163. def feed(self, character: str) -> None:
  164. self._character_count += 1
  165. if (
  166. character.isspace()
  167. or is_punctuation(character)
  168. or character in COMMON_SAFE_ASCII_CHARACTERS
  169. ):
  170. self._last_printable_seen = None
  171. return
  172. if self._last_printable_seen is None:
  173. self._last_printable_seen = character
  174. return
  175. unicode_range_a = unicode_range(
  176. self._last_printable_seen
  177. ) # type: Optional[str]
  178. unicode_range_b = unicode_range(character) # type: Optional[str]
  179. if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
  180. self._suspicious_successive_range_count += 1
  181. self._last_printable_seen = character
  182. def reset(self) -> None: # pragma: no cover
  183. self._character_count = 0
  184. self._suspicious_successive_range_count = 0
  185. self._last_printable_seen = None
  186. @property
  187. def ratio(self) -> float:
  188. if self._character_count == 0:
  189. return 0.0
  190. ratio_of_suspicious_range_usage = (
  191. self._suspicious_successive_range_count * 2
  192. ) / self._character_count # type: float
  193. if ratio_of_suspicious_range_usage < 0.1:
  194. return 0.0
  195. return ratio_of_suspicious_range_usage
  196. class SuperWeirdWordPlugin(MessDetectorPlugin):
  197. def __init__(self) -> None:
  198. self._word_count = 0 # type: int
  199. self._bad_word_count = 0 # type: int
  200. self._foreign_long_count = 0 # type: int
  201. self._is_current_word_bad = False # type: bool
  202. self._foreign_long_watch = False # type: bool
  203. self._character_count = 0 # type: int
  204. self._bad_character_count = 0 # type: int
  205. self._buffer = "" # type: str
  206. self._buffer_accent_count = 0 # type: int
  207. def eligible(self, character: str) -> bool:
  208. return True
  209. def feed(self, character: str) -> None:
  210. if character.isalpha():
  211. self._buffer = "".join([self._buffer, character])
  212. if is_accentuated(character):
  213. self._buffer_accent_count += 1
  214. if (
  215. self._foreign_long_watch is False
  216. and (is_latin(character) is False or is_accentuated(character))
  217. and is_cjk(character) is False
  218. and is_hangul(character) is False
  219. and is_katakana(character) is False
  220. and is_hiragana(character) is False
  221. and is_thai(character) is False
  222. ):
  223. self._foreign_long_watch = True
  224. return
  225. if not self._buffer:
  226. return
  227. if (
  228. character.isspace() or is_punctuation(character) or is_separator(character)
  229. ) and self._buffer:
  230. self._word_count += 1
  231. buffer_length = len(self._buffer) # type: int
  232. self._character_count += buffer_length
  233. if buffer_length >= 4:
  234. if self._buffer_accent_count / buffer_length > 0.34:
  235. self._is_current_word_bad = True
  236. # Word/Buffer ending with a upper case accentuated letter are so rare,
  237. # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
  238. if is_accentuated(self._buffer[-1]) and self._buffer[-1].isupper():
  239. self._foreign_long_count += 1
  240. self._is_current_word_bad = True
  241. if buffer_length >= 24 and self._foreign_long_watch:
  242. self._foreign_long_count += 1
  243. self._is_current_word_bad = True
  244. if self._is_current_word_bad:
  245. self._bad_word_count += 1
  246. self._bad_character_count += len(self._buffer)
  247. self._is_current_word_bad = False
  248. self._foreign_long_watch = False
  249. self._buffer = ""
  250. self._buffer_accent_count = 0
  251. elif (
  252. character not in {"<", ">", "-", "="}
  253. and character.isdigit() is False
  254. and is_symbol(character)
  255. ):
  256. self._is_current_word_bad = True
  257. self._buffer += character
  258. def reset(self) -> None: # pragma: no cover
  259. self._buffer = ""
  260. self._is_current_word_bad = False
  261. self._foreign_long_watch = False
  262. self._bad_word_count = 0
  263. self._word_count = 0
  264. self._character_count = 0
  265. self._bad_character_count = 0
  266. self._foreign_long_count = 0
  267. @property
  268. def ratio(self) -> float:
  269. if self._word_count <= 10 and self._foreign_long_count == 0:
  270. return 0.0
  271. return self._bad_character_count / self._character_count
  272. class CjkInvalidStopPlugin(MessDetectorPlugin):
  273. """
  274. GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
  275. can be easily detected. Searching for the overuse of '丅' and '丄'.
  276. """
  277. def __init__(self) -> None:
  278. self._wrong_stop_count = 0 # type: int
  279. self._cjk_character_count = 0 # type: int
  280. def eligible(self, character: str) -> bool:
  281. return True
  282. def feed(self, character: str) -> None:
  283. if character in {"丅", "丄"}:
  284. self._wrong_stop_count += 1
  285. return
  286. if is_cjk(character):
  287. self._cjk_character_count += 1
  288. def reset(self) -> None: # pragma: no cover
  289. self._wrong_stop_count = 0
  290. self._cjk_character_count = 0
  291. @property
  292. def ratio(self) -> float:
  293. if self._cjk_character_count < 16:
  294. return 0.0
  295. return self._wrong_stop_count / self._cjk_character_count
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
    """Detect abnormal aLtErNaTiNg upper/lower case usage within chunks.

    A "chunk" is a run of case-variable alphabetic characters; anything
    else (digits, punctuation, spaces, caseless scripts) acts as a
    separator that flushes the chunk's statistics.
    """

    def __init__(self) -> None:
        # True when one upper/lower flip has been seen and a second is awaited.
        self._buf = False  # type: bool
        self._character_count_since_last_sep = 0  # type: int
        # Flip score accumulated for the current chunk.
        self._successive_upper_lower_count = 0  # type: int
        # Flip score committed from completed chunks.
        self._successive_upper_lower_count_final = 0  # type: int
        self._character_count = 0  # type: int
        self._last_alpha_seen = None  # type: Optional[str]
        # Whether the current chunk is, so far, pure ASCII.
        self._current_ascii_only = True  # type: bool

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        is_concerned = character.isalpha() and is_case_variable(character)
        chunk_sep = is_concerned is False
        if chunk_sep and self._character_count_since_last_sep > 0:
            # Chunk boundary: commit its flip score only for short (<= 64
            # chars), non-ASCII chunks, then reset per-chunk state.
            if (
                self._character_count_since_last_sep <= 64
                and character.isdigit() is False
                and self._current_ascii_only is False
            ):
                self._successive_upper_lower_count_final += (
                    self._successive_upper_lower_count
                )
            self._successive_upper_lower_count = 0
            self._character_count_since_last_sep = 0
            self._last_alpha_seen = None
            self._buf = False
            self._character_count += 1
            self._current_ascii_only = True
            return
        if self._current_ascii_only is True and is_ascii(character) is False:
            self._current_ascii_only = False
        if self._last_alpha_seen is not None:
            if (character.isupper() and self._last_alpha_seen.islower()) or (
                character.islower() and self._last_alpha_seen.isupper()
            ):
                if self._buf is True:
                    # Second flip in a row: score it (weight 2) and re-arm.
                    self._successive_upper_lower_count += 2
                    self._buf = False
                else:
                    self._buf = True
            else:
                self._buf = False
        self._character_count += 1
        self._character_count_since_last_sep += 1
        self._last_alpha_seen = character

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._character_count_since_last_sep = 0
        self._successive_upper_lower_count = 0
        self._successive_upper_lower_count_final = 0
        self._last_alpha_seen = None
        self._buf = False
        self._current_ascii_only = True

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0
        return self._successive_upper_lower_count_final / self._character_count
  355. def is_suspiciously_successive_range(
  356. unicode_range_a: Optional[str], unicode_range_b: Optional[str]
  357. ) -> bool:
  358. """
  359. Determine if two Unicode range seen next to each other can be considered as suspicious.
  360. """
  361. if unicode_range_a is None or unicode_range_b is None:
  362. return True
  363. if unicode_range_a == unicode_range_b:
  364. return False
  365. if "Latin" in unicode_range_a and "Latin" in unicode_range_b:
  366. return False
  367. if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
  368. return False
  369. # Latin characters can be accompanied with a combining diacritical mark
  370. # eg. Vietnamese.
  371. if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and (
  372. "Combining" in unicode_range_a or "Combining" in unicode_range_b
  373. ):
  374. return False
  375. keywords_range_a, keywords_range_b = unicode_range_a.split(
  376. " "
  377. ), unicode_range_b.split(" ")
  378. for el in keywords_range_a:
  379. if el in UNICODE_SECONDARY_RANGE_KEYWORD:
  380. continue
  381. if el in keywords_range_b:
  382. return False
  383. # Japanese Exception
  384. range_a_jp_chars, range_b_jp_chars = (
  385. unicode_range_a
  386. in (
  387. "Hiragana",
  388. "Katakana",
  389. ),
  390. unicode_range_b in ("Hiragana", "Katakana"),
  391. )
  392. if (range_a_jp_chars or range_b_jp_chars) and (
  393. "CJK" in unicode_range_a or "CJK" in unicode_range_b
  394. ):
  395. return False
  396. if range_a_jp_chars and range_b_jp_chars:
  397. return False
  398. if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
  399. if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
  400. return False
  401. if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
  402. return False
  403. # Chinese/Japanese use dedicated range for punctuation and/or separators.
  404. if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or (
  405. unicode_range_a in ["Katakana", "Hiragana"]
  406. and unicode_range_b in ["Katakana", "Hiragana"]
  407. ):
  408. if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
  409. return False
  410. if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
  411. return False
  412. return True
  413. @lru_cache(maxsize=2048)
  414. def mess_ratio(
  415. decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
  416. ) -> float:
  417. """
  418. Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
  419. """
  420. detectors = [
  421. md_class() for md_class in MessDetectorPlugin.__subclasses__()
  422. ] # type: List[MessDetectorPlugin]
  423. length = len(decoded_sequence) + 1 # type: int
  424. mean_mess_ratio = 0.0 # type: float
  425. if length < 512:
  426. intermediary_mean_mess_ratio_calc = 32 # type: int
  427. elif length <= 1024:
  428. intermediary_mean_mess_ratio_calc = 64
  429. else:
  430. intermediary_mean_mess_ratio_calc = 128
  431. for character, index in zip(decoded_sequence + "\n", range(length)):
  432. for detector in detectors:
  433. if detector.eligible(character):
  434. detector.feed(character)
  435. if (
  436. index > 0 and index % intermediary_mean_mess_ratio_calc == 0
  437. ) or index == length - 1:
  438. mean_mess_ratio = sum(dt.ratio for dt in detectors)
  439. if mean_mess_ratio >= maximum_threshold:
  440. break
  441. if debug:
  442. for dt in detectors: # pragma: nocover
  443. print(dt.__class__, dt.ratio)
  444. return round(mean_mess_ratio, 3)