utils.py 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342
  1. try:
  2. import unicodedata2 as unicodedata
  3. except ImportError:
  4. import unicodedata # type: ignore[no-redef]
  5. import importlib
  6. import logging
  7. from codecs import IncrementalDecoder
  8. from encodings.aliases import aliases
  9. from functools import lru_cache
  10. from re import findall
  11. from typing import List, Optional, Set, Tuple, Union
  12. from _multibytecodec import MultibyteIncrementalDecoder # type: ignore
  13. from .constant import (
  14. ENCODING_MARKS,
  15. IANA_SUPPORTED_SIMILAR,
  16. RE_POSSIBLE_ENCODING_INDICATION,
  17. UNICODE_RANGES_COMBINED,
  18. UNICODE_SECONDARY_RANGE_KEYWORD,
  19. UTF8_MAXIMAL_ALLOCATION,
  20. )
  @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
  def is_accentuated(character: str) -> bool:
      """
      Tell whether the Unicode name of a character mentions an accent mark.

      The recognised marks are grave, acute, cedilla, diaeresis, circumflex
      and tilde. A character that has no Unicode name is reported as not
      accentuated.
      """
      try:
          name = unicodedata.name(character)  # type: str
      except ValueError:
          # unicodedata.name() raises ValueError when no name is defined.
          return False

      accent_markers = (
          "WITH GRAVE",
          "WITH ACUTE",
          "WITH CEDILLA",
          "WITH DIAERESIS",
          "WITH CIRCUMFLEX",
          "WITH TILDE",
      )
      return any(marker in name for marker in accent_markers)
  @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
  def remove_accent(character: str) -> str:
      """
      Return the base character of an accentuated character.

      The decomposition mapping of the character is looked up and its first
      code point (the base letter) is returned. The character is returned
      unchanged when:

      - it has no decomposition mapping at all, or
      - its mapping is a tagged compatibility mapping such as
        "<super> 0061" or "<compat> 0066 0069". Those do not describe
        "letter + accent", and the leading "<tag>" token is not a
        hexadecimal code point, so parsing it would raise ValueError.
      """
      decomposed = unicodedata.decomposition(character)  # type: str
      if not decomposed or decomposed.startswith("<"):
          return character

      codes = decomposed.split(" ")  # type: List[str]
      # First entry of a canonical mapping is the base code point, in hex.
      return chr(int(codes[0], 16))
  @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
  def unicode_range(character: str) -> Optional[str]:
      """
      Retrieve the Unicode range official name from a single character.

      Walks UNICODE_RANGES_COMBINED in its iteration order and returns the
      name of the first range containing the character's code point, or
      None when no range contains it.
      """
      code_point = ord(character)  # type: int
      return next(
          (
              range_name
              for range_name, ord_range in UNICODE_RANGES_COMBINED.items()
              if code_point in ord_range
          ),
          None,
      )
  @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
  def is_latin(character: str) -> bool:
      """
      Tell whether the Unicode name of a character contains "LATIN".

      Characters without a Unicode name yield False.
      """
      try:
          return "LATIN" in unicodedata.name(character)
      except ValueError:
          # No Unicode name is defined for this character.
          return False
  def is_ascii(character: str) -> bool:
      """
      Tell whether the given string is made of ASCII code points only.

      An empty string is considered ASCII.
      """
      # ASCII covers code points 0x00..0x7F inclusive.
      return all(ord(symbol) < 0x80 for symbol in character)
  @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
  def is_punctuation(character: str) -> bool:
      """
      Tell whether a character counts as punctuation.

      True when the Unicode general category contains "P", or otherwise
      when the name of the character's Unicode range contains "Punctuation".
      """
      if "P" in unicodedata.category(character):
          return True

      range_name = unicode_range(character)  # type: Optional[str]
      return range_name is not None and "Punctuation" in range_name
  @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
  def is_symbol(character: str) -> bool:
      """
      Tell whether a character counts as a symbol.

      True when the Unicode general category contains "S" or "N", or
      otherwise when the name of the character's Unicode range contains
      "Forms".
      """
      category = unicodedata.category(character)  # type: str
      if any(flag in category for flag in ("S", "N")):
          return True

      range_name = unicode_range(character)  # type: Optional[str]
      return range_name is not None and "Forms" in range_name
  @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
  def is_emoticon(character: str) -> bool:
      """
      Tell whether the name of the character's Unicode range contains
      "Emoticons". Characters outside every known range yield False.
      """
      range_name = unicode_range(character)  # type: Optional[str]
      return range_name is not None and "Emoticons" in range_name
  @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
  def is_separator(character: str) -> bool:
      """
      Tell whether a character acts as a separator.

      Whitespace, the explicit delimiters | + , ; < > and any character
      whose Unicode general category contains "Z" are separators.
      """
      if character.isspace():
          return True
      if character in {"|", "+", ",", ";", "<", ">"}:
          return True
      return "Z" in unicodedata.category(character)
  @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
  def is_case_variable(character: str) -> bool:
      """
      Tell whether a character is cased, i.e. exactly one of
      str.islower() / str.isupper() holds for it.
      """
      reads_lower = character.islower()
      reads_upper = character.isupper()
      return reads_lower != reads_upper
  def is_private_use_only(character: str) -> bool:
      """
      Tell whether a character's Unicode general category is "Co"
      (private use).
      """
      return unicodedata.category(character) == "Co"
  @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
  def is_cjk(character: str) -> bool:
      """
      Tell whether the Unicode name of a character contains "CJK".

      Characters without a Unicode name yield False.
      """
      try:
          return "CJK" in unicodedata.name(character)
      except ValueError:
          # No Unicode name is defined for this character.
          return False
  @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
  def is_hiragana(character: str) -> bool:
      """
      Tell whether the Unicode name of a character contains "HIRAGANA".

      Characters without a Unicode name yield False.
      """
      try:
          return "HIRAGANA" in unicodedata.name(character)
      except ValueError:
          # No Unicode name is defined for this character.
          return False
  @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
  def is_katakana(character: str) -> bool:
      """
      Tell whether the Unicode name of a character contains "KATAKANA".

      Characters without a Unicode name yield False.
      """
      try:
          return "KATAKANA" in unicodedata.name(character)
      except ValueError:
          # No Unicode name is defined for this character.
          return False
  @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
  def is_hangul(character: str) -> bool:
      """
      Tell whether the Unicode name of a character contains "HANGUL".

      Characters without a Unicode name yield False.
      """
      try:
          return "HANGUL" in unicodedata.name(character)
      except ValueError:
          # No Unicode name is defined for this character.
          return False
  @lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
  def is_thai(character: str) -> bool:
      """
      Tell whether the Unicode name of a character contains "THAI".

      Characters without a Unicode name yield False.
      """
      try:
          return "THAI" in unicodedata.name(character)
      except ValueError:
          # No Unicode name is defined for this character.
          return False
  @lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
  def is_unicode_range_secondary(range_name: str) -> bool:
      """
      Tell whether a Unicode range name contains at least one of the
      keywords listed in UNICODE_SECONDARY_RANGE_KEYWORD.
      """
      for keyword in UNICODE_SECONDARY_RANGE_KEYWORD:
          if keyword in range_name:
              return True
      return False
  def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional[str]:
      """
      Extract using ASCII-only decoder any specified encoding in the first n-bytes.

      The first `search_zone` bytes are decoded as ASCII (undecodable bytes
      are dropped) and scanned with RE_POSSIBLE_ENCODING_INDICATION. Each
      match is lower-cased, has "-" replaced by "_", and is compared with
      both the alias names and the canonical names of Python's encoding
      alias table. The canonical name of the first hit is returned, or None
      when nothing matches.

      Raises TypeError when `sequence` is not a bytes instance.
      """
      if not isinstance(sequence, bytes):
          raise TypeError

      head = sequence[: min(len(sequence), search_zone)]  # type: bytes
      candidates = findall(
          RE_POSSIBLE_ENCODING_INDICATION,
          head.decode("ascii", errors="ignore"),
      )  # type: List[str]

      for candidate in candidates:
          normalized = candidate.lower().replace("-", "_")
          for encoding_alias, encoding_iana in aliases.items():
              if normalized in (encoding_alias, encoding_iana):
                  return encoding_iana

      return None
  @lru_cache(maxsize=128)
  def is_multi_byte_encoding(name: str) -> bool:
      """
      Verify whether a specific encoding is a multi-byte one, based on its IANA name.

      The UTF-7/8/16/32 family is listed explicitly. Any other encoding is
      multi-byte when the IncrementalDecoder of its `encodings.<name>`
      module derives from MultibyteIncrementalDecoder.
      """
      unicode_family = {
          "utf_8",
          "utf_8_sig",
          "utf_16",
          "utf_16_be",
          "utf_16_le",
          "utf_32",
          "utf_32_le",
          "utf_32_be",
          "utf_7",
      }
      if name in unicode_family:
          return True

      codec_module = importlib.import_module("encodings.{}".format(name))
      return issubclass(
          codec_module.IncrementalDecoder,  # type: ignore
          MultibyteIncrementalDecoder,
      )
  def identify_sig_or_bom(sequence: bytes) -> Tuple[Optional[str], bytes]:
      """
      Identify and extract SIG/BOM in given sequence.

      Returns a pair (encoding name, matched mark) for the first entry of
      ENCODING_MARKS one of whose marks prefixes `sequence`, or (None, b"")
      when no mark matches.
      """
      for iana_encoding in ENCODING_MARKS:
          declared = ENCODING_MARKS[iana_encoding]  # type: Union[bytes, List[bytes]]
          # An entry holds either a single mark or a list of marks.
          candidates = [declared] if isinstance(declared, bytes) else declared

          for mark in candidates:
              if sequence.startswith(mark):
                  return iana_encoding, mark

      return None, b""
  def should_strip_sig_or_bom(iana_encoding: str) -> bool:
      """
      Tell whether the SIG/BOM should be stripped for the given encoding.

      Returns False only for "utf_16" and "utf_32"; every other name
      yields True.
      """
      # NOTE(review): presumably these two codecs consume the BOM themselves
      # while decoding - confirm against the callers.
      keeps_mark = iana_encoding in {"utf_16", "utf_32"}
      return not keeps_mark
  def iana_name(cp_name: str, strict: bool = True) -> str:
      """
      Resolve a code page name to the canonical name used by Python's
      encoding alias table.

      The name is lower-cased and "-" is replaced by "_" before being
      compared with both alias and canonical names. When nothing matches,
      ValueError is raised if `strict` is true; otherwise the normalized
      name is returned as-is.
      """
      normalized = cp_name.lower().replace("-", "_")

      for encoding_alias, encoding_iana in aliases.items():
          if normalized == encoding_alias or normalized == encoding_iana:
              return encoding_iana

      if not strict:
          return normalized

      raise ValueError("Unable to retrieve IANA for '{}'".format(normalized))
  def range_scan(decoded_sequence: str) -> List[str]:
      """
      List the distinct Unicode range names met in a decoded string.

      Characters for which unicode_range() returns None are skipped.
      """
      ranges = {
          character_range
          for character_range in map(unicode_range, decoded_sequence)
          if character_range is not None
      }  # type: Set[str]
      return list(ranges)
  def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
      """
      Measure how alike two single-byte code pages are.

      Every possible byte value (0x00..0xFF) is decoded with both code
      pages, undecodable bytes being ignored, and the function returns the
      fraction of byte values that decode to the same text. The result is
      between 0.0 and 1.0; two identical code pages score exactly 1.0.

      Multi-byte encodings cannot be compared byte by byte, so 0.0 is
      returned as soon as either encoding is a multi-byte one.
      """
      if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
          return 0.0

      decoder_a = importlib.import_module("encodings.{}".format(iana_name_a)).IncrementalDecoder  # type: ignore
      decoder_b = importlib.import_module("encodings.{}".format(iana_name_b)).IncrementalDecoder  # type: ignore

      id_a = decoder_a(errors="ignore")  # type: IncrementalDecoder
      id_b = decoder_b(errors="ignore")  # type: IncrementalDecoder

      # A byte has 256 possible values. Scanning all of them and dividing
      # by the same count keeps the ratio in [0.0, 1.0] and includes 0xFF.
      byte_value_count = 256  # type: int

      character_match_count = 0  # type: int
      for byte_value in range(byte_value_count):
          to_be_decoded = bytes([byte_value])  # type: bytes
          if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
              character_match_count += 1

      return character_match_count / byte_value_count
  def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
      """
      Determine if two code pages are at least 80% similar. The
      IANA_SUPPORTED_SIMILAR dict was generated using the function
      cp_similarity.

      True only when `iana_name_a` is a key of IANA_SUPPORTED_SIMILAR and
      `iana_name_b` is listed under that key.
      """
      if iana_name_a not in IANA_SUPPORTED_SIMILAR:
          return False
      return iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
  def set_logging_handler(
      name: str = "charset_normalizer",
      level: int = logging.INFO,
      format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
  ) -> None:
      """
      Attach a new stream handler to the named logger and set its level.

      :param name: Name of the logger to configure.
      :param level: Level applied to the logger itself.
      :param format_string: Format given to the handler's formatter.

      Each call adds another handler to the logger.
      """
      stream_handler = logging.StreamHandler()
      stream_handler.setFormatter(logging.Formatter(format_string))

      target_logger = logging.getLogger(name)
      target_logger.setLevel(level)
      target_logger.addHandler(stream_handler)