download.py

  1. """Download files with progress indicators.
  2. """
import cgi
import logging
import mimetypes
import os
from typing import Iterable, Optional, Tuple

from pip._vendor.requests.models import CONTENT_CHUNK_SIZE, Response

from pip._internal.cli.progress_bars import DownloadProgressProvider
from pip._internal.exceptions import NetworkConnectionError
from pip._internal.models.index import PyPI
from pip._internal.models.link import Link
from pip._internal.network.cache import is_from_cache
from pip._internal.network.session import PipSession
from pip._internal.network.utils import HEADERS, raise_for_status, response_chunks
from pip._internal.utils.misc import format_size, redact_auth_from_url, splitext

logger = logging.getLogger(__name__)


def _get_http_response_size(resp: Response) -> Optional[int]:
    try:
        return int(resp.headers["content-length"])
    except (ValueError, KeyError, TypeError):
        return None


def _prepare_download(
    resp: Response,
    link: Link,
    progress_bar: str,
) -> Iterable[bytes]:
    total_length = _get_http_response_size(resp)

    if link.netloc == PyPI.file_storage_domain:
        url = link.show_url
    else:
        url = link.url_without_fragment

    logged_url = redact_auth_from_url(url)

    if total_length:
        logged_url = "{} ({})".format(logged_url, format_size(total_length))

    if is_from_cache(resp):
        logger.info("Using cached %s", logged_url)
    else:
        logger.info("Downloading %s", logged_url)

    # Only show a progress bar when it is useful: skip it in quiet mode, for
    # cached responses, and for small downloads (40 kB or less) with a known
    # length.
    if logger.getEffectiveLevel() > logging.INFO:
        show_progress = False
    elif is_from_cache(resp):
        show_progress = False
    elif not total_length:
        show_progress = True
    elif total_length > (40 * 1000):
        show_progress = True
    else:
        show_progress = False

    chunks = response_chunks(resp, CONTENT_CHUNK_SIZE)
    if not show_progress:
        return chunks

    return DownloadProgressProvider(progress_bar, max=total_length)(chunks)
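
# Illustrative note (not part of the original module): with the rules above, a
# fresh, uncached 5 MB response at normal verbosity has its chunk iterator
# wrapped by DownloadProgressProvider, while a cache hit or a response of
# 40 kB or less with a known length is returned as bare chunks.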


def sanitize_content_filename(filename: str) -> str:
    """
    Sanitize the "filename" value from a Content-Disposition header.
    """
    return os.path.basename(filename)


def parse_content_disposition(content_disposition: str, default_filename: str) -> str:
    """
    Parse the "filename" value from a Content-Disposition header, and
    return the default filename if the result is empty.
    """
    _type, params = cgi.parse_header(content_disposition)
    filename = params.get("filename")
    if filename:
        # We need to sanitize the filename to prevent directory traversal
        # in case the filename contains ".." path parts.
        filename = sanitize_content_filename(filename)
    return filename or default_filename
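
# Illustrative example (not part of the original module):
#   parse_content_disposition('attachment; filename="../../evil.whl"', "pkg.whl")
#   returns "evil.whl": sanitize_content_filename() keeps only the base name,
#   so a crafted header cannot escape the download directory.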


def _get_http_response_filename(resp: Response, link: Link) -> str:
    """Get an ideal filename from the given HTTP response, falling back to
    the link filename if not provided.
    """
    filename = link.filename  # fallback
    # Have a look at the Content-Disposition header for a better guess
    content_disposition = resp.headers.get("content-disposition")
    if content_disposition:
        filename = parse_content_disposition(content_disposition, filename)
    ext: Optional[str] = splitext(filename)[1]
    if not ext:
        ext = mimetypes.guess_extension(resp.headers.get("content-type", ""))
        if ext:
            filename += ext
    if not ext and link.url != resp.url:
        ext = os.path.splitext(resp.url)[1]
        if ext:
            filename += ext
    return filename
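
# Illustrative example (not part of the original module): for a link whose
# filename is "download" (no extension), no Content-Disposition header, and a
# response with "Content-Type: application/zip", mimetypes.guess_extension()
# supplies ".zip" and the function returns "download.zip".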


def _http_get_download(session: PipSession, link: Link) -> Response:
    # Drop any URL fragment (e.g. a hash fragment) before issuing the request.
    target_url = link.url.split("#", 1)[0]
    resp = session.get(target_url, headers=HEADERS, stream=True)
    raise_for_status(resp)
    return resp


class Downloader:
    def __init__(
        self,
        session: PipSession,
        progress_bar: str,
    ) -> None:
        self._session = session
        self._progress_bar = progress_bar

    def __call__(self, link: Link, location: str) -> Tuple[str, str]:
        """Download the file given by link into location."""
        try:
            resp = _http_get_download(self._session, link)
        except NetworkConnectionError as e:
            assert e.response is not None
            logger.critical(
                "HTTP error %s while getting %s", e.response.status_code, link
            )
            raise

        filename = _get_http_response_filename(resp, link)
        filepath = os.path.join(location, filename)

        chunks = _prepare_download(resp, link, self._progress_bar)
        with open(filepath, "wb") as content_file:
            for chunk in chunks:
                content_file.write(chunk)
        content_type = resp.headers.get("Content-Type", "")
        return filepath, content_type
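

# Illustrative sketch (not part of the original module), assuming a configured
# PipSession; the helper name is a placeholder and the "on" progress-bar style
# is assumed to be a valid choice.
def _example_single_download(session: PipSession, url: str, target_dir: str) -> str:
    download = Downloader(session, progress_bar="on")
    filepath, content_type = download(Link(url), target_dir)
    logger.debug("Saved %s (%s)", filepath, content_type)
    return filepath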


class BatchDownloader:
    def __init__(
        self,
        session: PipSession,
        progress_bar: str,
    ) -> None:
        self._session = session
        self._progress_bar = progress_bar

    def __call__(
        self, links: Iterable[Link], location: str
    ) -> Iterable[Tuple[Link, Tuple[str, str]]]:
        """Download the files given by links into location."""
        for link in links:
            try:
                resp = _http_get_download(self._session, link)
            except NetworkConnectionError as e:
                assert e.response is not None
                logger.critical(
                    "HTTP error %s while getting %s",
                    e.response.status_code,
                    link,
                )
                raise

            filename = _get_http_response_filename(resp, link)
            filepath = os.path.join(location, filename)

            chunks = _prepare_download(resp, link, self._progress_bar)
            with open(filepath, "wb") as content_file:
                for chunk in chunks:
                    content_file.write(chunk)
            content_type = resp.headers.get("Content-Type", "")
            yield link, (filepath, content_type)
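

# Illustrative sketch (not part of the original module): driving BatchDownloader
# over several links; the helper name is a placeholder.
def _example_batch_download(
    session: PipSession, urls: Iterable[str], target_dir: str
) -> None:
    batch_download = BatchDownloader(session, progress_bar="on")
    for link, (filepath, content_type) in batch_download(
        [Link(url) for url in urls], target_dir
    ):
        logger.debug("Saved %s to %s (%s)", link, filepath, content_type)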