legacy.py

import warnings
from typing import Dict, Optional, Union

from .api import from_bytes, from_fp, from_path, normalize
from .constant import CHARDET_CORRESPONDENCE
from .models import CharsetMatch, CharsetMatches


def detect(byte_str: bytes) -> Dict[str, Optional[Union[str, float]]]:
    """
    chardet legacy method
    Detect the encoding of the given byte string. It should be mostly backward-compatible.
    Encoding names will match Chardet's own naming whenever possible (except for encodings
    that Chardet does not support).
    This function is deprecated; it exists to ease migration of existing projects. Consult
    the documentation for further information. It is not planned for removal.

    :param byte_str: The byte sequence to examine.
    """
    if not isinstance(byte_str, (bytearray, bytes)):
        raise TypeError(  # pragma: nocover
            "Expected object of type bytes or bytearray, got: "
            "{0}".format(type(byte_str))
        )

    if isinstance(byte_str, bytearray):
        byte_str = bytes(byte_str)

    r = from_bytes(byte_str).best()

    encoding = r.encoding if r is not None else None
    language = r.language if r is not None and r.language != "Unknown" else ""
    confidence = 1.0 - r.chaos if r is not None else None

    # Note: CharsetNormalizer does not return 'UTF-8-SIG' because the signature (BOM) gets
    # stripped in the detection/normalization process, but chardet does return 'utf-8-sig'
    # and it is a valid codec name.
    if r is not None and encoding == "utf_8" and r.bom:
        encoding += "_sig"

    return {
        "encoding": encoding
        if encoding not in CHARDET_CORRESPONDENCE
        else CHARDET_CORRESPONDENCE[encoding],
        "language": language,
        "confidence": confidence,
    }
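
# Illustrative usage (a sketch, not part of the original module). Assuming the package
# re-exports detect() at its top level, it can replace chardet.detect() directly, since
# it returns the same dict shape with "encoding", "language" and "confidence" keys:
#
#     from charset_normalizer import detect
#
#     result = detect("Bonjour à tous !".encode("utf_8"))
#     result["encoding"]    # something like 'utf-8' (chardet-style name)
#     result["confidence"]  # a float between 0.0 and 1.0, or None
#     result["language"]    # e.g. 'French', or '' when the language is unknown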


class CharsetNormalizerMatch(CharsetMatch):
    pass


class CharsetNormalizerMatches(CharsetMatches):
    @staticmethod
    def from_fp(*args, **kwargs):  # type: ignore
        warnings.warn(  # pragma: nocover
            "staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
            "and scheduled to be removed in 3.0",
            DeprecationWarning,
        )
        return from_fp(*args, **kwargs)  # pragma: nocover

    @staticmethod
    def from_bytes(*args, **kwargs):  # type: ignore
        warnings.warn(  # pragma: nocover
            "staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
            "and scheduled to be removed in 3.0",
            DeprecationWarning,
        )
        return from_bytes(*args, **kwargs)  # pragma: nocover

    @staticmethod
    def from_path(*args, **kwargs):  # type: ignore
        warnings.warn(  # pragma: nocover
            "staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
            "and scheduled to be removed in 3.0",
            DeprecationWarning,
        )
        return from_path(*args, **kwargs)  # pragma: nocover

    @staticmethod
    def normalize(*args, **kwargs):  # type: ignore
        warnings.warn(  # pragma: nocover
            "staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
            "and scheduled to be removed in 3.0",
            DeprecationWarning,
        )
        return normalize(*args, **kwargs)  # pragma: nocover


class CharsetDetector(CharsetNormalizerMatches):
    pass


class CharsetDoctor(CharsetNormalizerMatches):
    pass
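

# Illustrative demo only (an assumption, not part of the upstream library): running this
# module as a package module (e.g. `python -m charset_normalizer.legacy`, assuming that
# package name) exercises detect() and surfaces the DeprecationWarning emitted by the
# legacy CharsetNormalizerMatches shims.
if __name__ == "__main__":  # pragma: nocover
    import pprint

    sample = "Ceci est un exemple de texte accentué.".encode("utf_8")

    # chardet-style result: {'encoding': ..., 'language': ..., 'confidence': ...}
    pprint.pprint(detect(sample))

    # The legacy static methods still work but emit a DeprecationWarning; new code
    # should call from_bytes() from the package API directly instead.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        CharsetNormalizerMatches.from_bytes(sample).best()
        for item in caught:
            print(item.category.__name__, ":", item.message)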