  1. """Download files with progress indicators.
  2. """
  3. import cgi
  4. import logging
  5. import mimetypes
  6. import os
  7. from typing import Iterable, Optional, Tuple
  8. from pip._vendor.requests.models import CONTENT_CHUNK_SIZE, Response
  9. from pip._internal.cli.progress_bars import get_download_progress_renderer
  10. from pip._internal.exceptions import NetworkConnectionError
  11. from pip._internal.models.index import PyPI
  12. from pip._internal.models.link import Link
  13. from pip._internal.network.cache import is_from_cache
  14. from pip._internal.network.session import PipSession
  15. from pip._internal.network.utils import HEADERS, raise_for_status, response_chunks
  16. from pip._internal.utils.misc import format_size, redact_auth_from_url, splitext
  17. logger = logging.getLogger(__name__)


def _get_http_response_size(resp: Response) -> Optional[int]:
    try:
        return int(resp.headers["content-length"])
    except (ValueError, KeyError, TypeError):
        return None


def _prepare_download(
    resp: Response,
    link: Link,
    progress_bar: str,
) -> Iterable[bytes]:
    total_length = _get_http_response_size(resp)

    if link.netloc == PyPI.file_storage_domain:
        url = link.show_url
    else:
        url = link.url_without_fragment

    logged_url = redact_auth_from_url(url)

    if total_length:
        logged_url = "{} ({})".format(logged_url, format_size(total_length))

    if is_from_cache(resp):
        logger.info("Using cached %s", logged_url)
    else:
        logger.info("Downloading %s", logged_url)

    if logger.getEffectiveLevel() > logging.INFO:
        show_progress = False
    elif is_from_cache(resp):
        show_progress = False
    elif not total_length:
        show_progress = True
    elif total_length > (40 * 1000):
        show_progress = True
    else:
        show_progress = False

    chunks = response_chunks(resp, CONTENT_CHUNK_SIZE)

    if not show_progress:
        return chunks

    renderer = get_download_progress_renderer(bar_type=progress_bar, size=total_length)
    return renderer(chunks)


def sanitize_content_filename(filename: str) -> str:
    """
    Sanitize the "filename" value from a Content-Disposition header.
    """
    return os.path.basename(filename)


def parse_content_disposition(content_disposition: str, default_filename: str) -> str:
    """
    Parse the "filename" value from a Content-Disposition header, and
    return the default filename if the result is empty.
    """
    _type, params = cgi.parse_header(content_disposition)
    filename = params.get("filename")
    if filename:
        # We need to sanitize the filename to prevent directory traversal
        # in case the filename contains ".." path parts.
        filename = sanitize_content_filename(filename)
    return filename or default_filename
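

# Illustrative example (the header values are hypothetical, shown only to make
# the parsing behaviour above concrete):
#   parse_content_disposition('attachment; filename="sample-1.0.tar.gz"', "fallback")
#   returns "sample-1.0.tar.gz", while a traversal attempt such as
#   filename="../../evil.sh" is reduced to "evil.sh" by sanitize_content_filename.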


def _get_http_response_filename(resp: Response, link: Link) -> str:
    """Get an ideal filename from the given HTTP response, falling back to
    the link filename if not provided.
    """
    filename = link.filename  # fallback
    # Have a look at the Content-Disposition header for a better guess
    content_disposition = resp.headers.get("content-disposition")
    if content_disposition:
        filename = parse_content_disposition(content_disposition, filename)
    ext: Optional[str] = splitext(filename)[1]
    if not ext:
        ext = mimetypes.guess_extension(resp.headers.get("content-type", ""))
        if ext:
            filename += ext
    if not ext and link.url != resp.url:
        ext = os.path.splitext(resp.url)[1]
        if ext:
            filename += ext
    return filename


def _http_get_download(session: PipSession, link: Link) -> Response:
    target_url = link.url.split("#", 1)[0]
    resp = session.get(target_url, headers=HEADERS, stream=True)
    raise_for_status(resp)
    return resp


class Downloader:
    def __init__(
        self,
        session: PipSession,
        progress_bar: str,
    ) -> None:
        self._session = session
        self._progress_bar = progress_bar

    def __call__(self, link: Link, location: str) -> Tuple[str, str]:
        """Download the file given by link into location."""
        try:
            resp = _http_get_download(self._session, link)
        except NetworkConnectionError as e:
            assert e.response is not None
            logger.critical(
                "HTTP error %s while getting %s", e.response.status_code, link
            )
            raise

        filename = _get_http_response_filename(resp, link)
        filepath = os.path.join(location, filename)

        chunks = _prepare_download(resp, link, self._progress_bar)
        with open(filepath, "wb") as content_file:
            for chunk in chunks:
                content_file.write(chunk)
        content_type = resp.headers.get("Content-Type", "")
        return filepath, content_type


class BatchDownloader:
    def __init__(
        self,
        session: PipSession,
        progress_bar: str,
    ) -> None:
        self._session = session
        self._progress_bar = progress_bar

    def __call__(
        self, links: Iterable[Link], location: str
    ) -> Iterable[Tuple[Link, Tuple[str, str]]]:
        """Download the files given by links into location."""
        for link in links:
            try:
                resp = _http_get_download(self._session, link)
            except NetworkConnectionError as e:
                assert e.response is not None
                logger.critical(
                    "HTTP error %s while getting %s",
                    e.response.status_code,
                    link,
                )
                raise

            filename = _get_http_response_filename(resp, link)
            filepath = os.path.join(location, filename)

            chunks = _prepare_download(resp, link, self._progress_bar)
            with open(filepath, "wb") as content_file:
                for chunk in chunks:
                    content_file.write(chunk)
            content_type = resp.headers.get("Content-Type", "")
            yield link, (filepath, content_type)
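

if __name__ == "__main__":
    # Illustrative usage sketch, not part of pip itself: shows how the
    # Downloader above might be driven directly.  The package URL and the
    # "off" progress-bar style are hypothetical placeholder values.
    import tempfile

    example_session = PipSession()
    example_link = Link(
        "https://files.pythonhosted.org/packages/sampleproject-1.0.tar.gz"
    )
    download = Downloader(example_session, progress_bar="off")
    with tempfile.TemporaryDirectory() as target_dir:
        path, content_type = download(example_link, target_dir)
        logger.info("Saved %s (%s)", path, content_type)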