From c19571de34c47de3a766541b041637ba5f716ed7 Mon Sep 17 00:00:00 2001
From: Illia Volochii <illia.volochii@gmail.com>
Date: Fri, 5 Dec 2025 16:40:41 +0200
Subject: [PATCH] Merge commit from fork

* Prevent decompression bomb for zstd in Python 3.14

* Add experimental `decompress_iter` for Brotli

* Update changes for Brotli

* Add `GzipDecoder.decompress_iter`

* Test https://github.com/python-hyper/brotlicffi/pull/207

* Pin Brotli

* Add `decompress_iter` to all decoders and make tests pass

* Pin brotlicffi to an official release

* Revert changes to response.py

* Add `max_length` parameter to all `decompress` methods

* Fix the `test_brotlipy` session

* Unset `_data` on gzip error

* Add a test for memory usage

* Test more methods

* Fix the test for `stream`

* Cover more lines with tests

* Add more coverage

* Make `read1` a bit more efficient

* Fix PyPy tests for Brotli

* Revert an unnecessarily moved check

* Add some comments

* Leave just one `self._obj.decompress` call in `GzipDecoder`

* Refactor test params

* Test reads with all data already in the decompressor

* Prevent needless copying of data decoded with `max_length`

* Rename the changed test

* Note that responses of unknown length should be streamed too

* Add a changelog entry

* Avoid returning a memory view from `BytesQueueBuffer`

* Add one more note to the changelog entry
---
 CHANGES.rst             |  22 ++++
 docs/advanced-usage.rst |   3 +-
 docs/user-guide.rst     |   4 +-
 noxfile.py              |  16 ++-
 pyproject.toml          |   5 +-
 src/urllib3/response.py | 279 ++++++++++++++++++++++++++++++++++------
 test/test_response.py   | 269 +++++++++++++++++++++++++++++++++++++-
 uv.lock                 | 177 +++++++++++--------------
 8 files changed, 621 insertions(+), 154 deletions(-)

Index: urllib3-1.22/docs/user-guide.rst
===================================================================
--- urllib3-1.22.orig/docs/user-guide.rst
+++ urllib3-1.22/docs/user-guide.rst
@@ -74,8 +74,8 @@ to a byte string representing the respon
     >>> r.data
     b'\xaa\xa5H?\x95\xe9\x9b\x11'
 
-.. note:: For larger responses, it's sometimes better to :ref:`stream <stream>`
-    the response.
+.. note:: For responses of large or unknown length, it's sometimes better to
+    :ref:`stream <stream>` the response.
 
 .. _request_data:
 
Index: urllib3-1.22/urllib3/response.py
===================================================================
--- urllib3-1.22.orig/urllib3/response.py
+++ urllib3-1.22/urllib3/response.py
@@ -1,14 +1,18 @@
 from __future__ import absolute_import
 from contextlib import contextmanager
+import collections
+import sys
+import warnings
 import zlib
 import io
 import logging
 from socket import timeout as SocketTimeout
 from socket import error as SocketError
 
+from . import util
 from ._collections import HTTPHeaderDict
 from .exceptions import (
-    BodyNotHttplibCompatible, ProtocolError, DecodeError, ReadTimeoutError,
+    BodyNotHttplibCompatible, ProtocolError, DecodeError, DependencyWarning, ReadTimeoutError,
     ResponseNotChunked, IncompleteRead, InvalidHeader
 )
 from .packages.six import string_types as basestring, binary_type, PY3
@@ -23,47 +26,126 @@ class DeflateDecoder(object):
 
     def __init__(self):
         self._first_try = True
-        self._data = binary_type()
+        self._first_try_data = binary_type()
+        self._unfed_data = binary_type()
         self._obj = zlib.decompressobj()
 
     def __getattr__(self, name):
         return getattr(self._obj, name)
 
-    def decompress(self, data):
-        if not data:
+    def decompress(self, data, max_length = -1):
+        data = self._unfed_data + data
+        self._unfed_data = b""
+        if not data and not self._obj.unconsumed_tail:
             return data
+        original_max_length = max_length
+        if original_max_length < 0:
+            max_length = 0
+        elif original_max_length == 0:
+            # We should not pass 0 to the zlib decompressor because 0 is
+            # the default value that will make zlib decompress without a
+            # length limit.
+            # Data should be stored for subsequent calls.
+            self._unfed_data = data
+            return b""
 
+        # Subsequent calls always reuse `self._obj`. zlib requires
+        # passing the unconsumed tail if decompression is to continue
         if not self._first_try:
-            return self._obj.decompress(data)
+            return self._obj.decompress(self._obj.unconsumed_tail + data, max_length)
 
-        self._data += data
+        # First call tries RFC 1950 ZLIB format; it must honor max_length too
+        self._first_try_data += data
         try:
-            decompressed = self._obj.decompress(data)
+            decompressed = self._obj.decompress(data, max_length)
             if decompressed:
                 self._first_try = False
-                self._data = None
+                self._first_try_data = b""
             return decompressed
+        # On failure, it falls back to RFC 1951 DEFLATE format.
         except zlib.error:
             self._first_try = False
             self._obj = zlib.decompressobj(-zlib.MAX_WBITS)
             try:
-                return self.decompress(self._data)
+                return self.decompress(self._first_try_data, original_max_length)
             finally:
-                self._data = None
+                self._first_try_data = b""
 
+    @property
+    def has_unconsumed_tail(self):
+        return bool(self._unfed_data) or (
+            bool(self._obj.unconsumed_tail) and not self._first_try
+        )
+
+class GzipDecoderState(object):
+
+    FIRST_MEMBER = 0
+    OTHER_MEMBERS = 1
+    SWALLOW_DATA = 2
 
 class GzipDecoder(object):
 
     def __init__(self):
         self._obj = zlib.decompressobj(16 + zlib.MAX_WBITS)
+        self._state = GzipDecoderState.FIRST_MEMBER
+        self._unconsumed_tail = b""
 
     def __getattr__(self, name):
         return getattr(self._obj, name)
 
-    def decompress(self, data):
-        if not data:
-            return data
-        return self._obj.decompress(data)
+    def decompress(self, data, max_length = -1):
+        ret = bytearray()
+        if self._state == GzipDecoderState.SWALLOW_DATA:
+            return bytes(ret)
+
+        if max_length == 0:
+            # We should not pass 0 to the zlib decompressor because 0 is
+            # the default value that will make zlib decompress without a
+            # length limit.
+            # Data should be stored for subsequent calls.
+            self._unconsumed_tail += data
+            return b""
+
+        # zlib requires passing the unconsumed_tail to the subsequent
+        # call if the decompression is to continue.
+        data = self._unconsumed_tail + data
+        if not data and self._obj.eof:
+            return bytes(ret)
+
+        while True:
+            try:
+                ret += self._obj.decompress(
+                    data, max(max_length - len(ret), 0)
+                )
+            except zlib.error:
+                previous_state = self._state
+                # Ignore data after the first error
+                self._state = GzipDecoderState.SWALLOW_DATA
+                self._unconsumed_tail = b""
+                if previous_state == GzipDecoderState.OTHER_MEMBERS:
+                    # Allow trailing garbage acceptable in other gzip clients
+                    return bytes(ret)
+                raise
+
+            self._unconsumed_tail = data = (
+                self._obj.unconsumed_tail or self._obj.unused_data
+            )
+            if max_length > 0 and len(ret) >= max_length:
+                break
+            data = self._obj.unused_data
+            if not data:
+                return bytes(ret)
+            # When the end of a gzip member is reached, a new decompressor
+            # must be created for unused (possibly future) data.
+            if self._obj.eof:
+                self._state = GzipDecoderState.OTHER_MEMBERS
+                self._obj = zlib.decompressobj(16 + zlib.MAX_WBITS)
+
+        return bytes(ret)
+
+    @property
+    def has_unconsumed_tail(self):
+        return bool(self._unconsumed_tail)
 
 
 def _get_decoder(mode):
@@ -73,6 +155,67 @@ def _get_decoder(mode):
     return DeflateDecoder()
 
 
+class BytesQueueBuffer(object):
+    """Memory-efficient bytes buffer
+
+    To return decoded data in read() and still follow the BufferedIOBase API, we need a
+    buffer to always return the correct amount of bytes.
+
+    This buffer should be filled using calls to put()
+
+    Our maximum memory usage is determined by the sum of the size of:
+
+     * self.buffer, which contains the full data
+     * the largest chunk that we will copy in get()
+    """
+
+    def __init__(self):
+        self.buffer = collections.deque()
+        self._size = 0
+
+    def __len__(self):
+        return self._size
+
+    def put(self, data):
+        self.buffer.append(data)
+        self._size += len(data)
+
+    def get(self, n):
+        if n == 0:
+            return b""
+        elif not self.buffer:
+            raise RuntimeError("buffer is empty")
+        elif n < 0:
+            raise ValueError("n should be > 0")
+
+        if len(self.buffer[0]) == n and isinstance(self.buffer[0], bytes):
+            self._size -= n
+            return self.buffer.popleft()
+
+        fetched = 0
+        ret = io.BytesIO()
+        while fetched < n:
+            remaining = n - fetched
+            chunk = self.buffer.popleft()
+            chunk_length = len(chunk)
+            if remaining < chunk_length:
+                chunk = memoryview(chunk)
+                left_chunk, right_chunk = chunk[:remaining], chunk[remaining:]
+                ret.write(left_chunk)
+                self.buffer.appendleft(right_chunk)
+                self._size -= remaining
+                break
+            else:
+                ret.write(chunk)
+                self._size -= chunk_length
+            fetched += chunk_length
+
+            if not self.buffer:
+                break
+
+        return ret.getvalue()
+
+
 class HTTPResponse(io.IOBase):
     """
     HTTP Response container.
@@ -124,6 +267,7 @@ class HTTPResponse(io.IOBase):
         self.reason = reason
         self.strict = strict
         self.decode_content = decode_content
+        self._has_decoded_content = False
         self.retries = retries
         self.enforce_content_length = enforce_content_length
 
@@ -154,6 +298,9 @@ class HTTPResponse(io.IOBase):
         # Determine length of response
         self.length_remaining = self._init_length(request_method)
 
+        # Used to return the correct amount of bytes for partial read()s
+        self._decoded_buffer = BytesQueueBuffer()
+
         # If requested, preload the body.
         if preload_content and not self._body:
             self._body = self.read(decode_content=decode_content)
@@ -257,13 +404,17 @@ class HTTPResponse(io.IOBase):
         if self._decoder is None and content_encoding in self.CONTENT_DECODERS:
             self._decoder = _get_decoder(content_encoding)
 
-    def _decode(self, data, decode_content, flush_decoder):
+    def _decode(self, data, decode_content, flush_decoder, max_length = None):
         """
         Decode the data passed in and potentially flush the decoder.
         """
+
+        if max_length is None or flush_decoder:
+            max_length = -1
+
         try:
             if decode_content and self._decoder:
-                data = self._decoder.decompress(data)
+                data = self._decoder.decompress(data, max_length)
         except (IOError, zlib.error) as e:
             content_encoding = self.headers.get('content-encoding', '').lower()
             raise DecodeError(
@@ -343,6 +494,95 @@ class HTTPResponse(io.IOBase):
             if self._original_response and self._original_response.isclosed():
                 self.release_conn()
 
+    def _fp_read(self, amt):
+        """
+        Read a response with the thought that reading the number of bytes
+        larger than can fit in a 32-bit int at a time via SSL in some
+        known cases leads to an overflow error that has to be prevented
+        if `amt` or `self.length_remaining` indicate that a problem may
+        happen.
+
+        The known cases:
+          * 3.8 <= CPython < 3.9.7 because of a bug
+            https://github.com/urllib3/urllib3/issues/2513#issuecomment-1152559900.
+          * urllib3 injected with pyOpenSSL-backed SSL-support.
+          * CPython < 3.10 only when `amt` does not fit 32-bit int.
+        """
+        assert self._fp
+        c_int_max = 2 ** 31 - 1
+        if (
+            (
+                (amt and amt > c_int_max)
+                or (self.length_remaining and self.length_remaining > c_int_max)
+            )
+            and not util.IS_SECURETRANSPORT
+            and (util.IS_PYOPENSSL or sys.version_info < (3, 10))
+        ):
+            buffer = io.BytesIO()
+            # Besides `max_chunk_amt` being a maximum chunk size, it
+            # affects memory overhead of reading a response by this
+            # method in CPython.
+            # `c_int_max` equal to 2 GiB - 1 byte is the actual maximum
+            # chunk size that does not lead to an overflow error, but
+            # 256 MiB is a compromise.
+            max_chunk_amt = 2 ** 28
+            while amt is None or amt != 0:
+                if amt is not None:
+                    chunk_amt = min(amt, max_chunk_amt)
+                    amt -= chunk_amt
+                else:
+                    chunk_amt = max_chunk_amt
+                data = self._fp.read(chunk_amt)
+                if not data:
+                    break
+                buffer.write(data)
+                del data  # to reduce peak memory usage by `max_chunk_amt`.
+            return buffer.getvalue()
+        else:
+            # StringIO doesn't like amt=None
+            return self._fp.read(amt) if amt is not None else self._fp.read()
+
+    def _raw_read(
+        self,
+        amt=None,
+    ):
+        """
+        Reads `amt` of bytes from the socket.
+        """
+        if self._fp is None:
+            return
+
+        fp_closed = getattr(self._fp, "closed", False)
+
+        with self._error_catcher():
+            data = self._fp_read(amt) if not fp_closed else b""
+            if amt is not None and amt != 0 and not data:
+                # Platform-specific: Buggy versions of Python.
+                # Close the connection when no data is returned
+                #
+                # This is redundant to what httplib/http.client _should_
+                # already do.  However, versions of python released before
+                # December 15, 2012 (http://bugs.python.org/issue16298) do
+                # not properly close the connection in all cases. There is
+                # no harm in redundantly calling close.
+                self._fp.close()
+                if self.enforce_content_length and self.length_remaining not in (
+                    0,
+                    None,
+                ):
+                    # This is an edge case that httplib failed to cover due
+                    # to concerns of backward compatibility. We're
+                    # addressing it here to make sure IncompleteRead is
+                    # raised during streaming, so all calls with incorrect
+                    # Content-Length are caught.
+                    raise IncompleteRead(self._fp_bytes_read, self.length_remaining)
+
+        if data:
+            self._fp_bytes_read += len(data)
+            if self.length_remaining is not None:
+                self.length_remaining -= len(data)
+        return data
+
     def read(self, amt=None, decode_content=None, cache_content=False):
         """
         Similar to :meth:`httplib.HTTPResponse.read`, but with two additional
@@ -368,47 +608,70 @@ class HTTPResponse(io.IOBase):
         if decode_content is None:
             decode_content = self.decode_content
 
-        if self._fp is None:
-            return
-
-        flush_decoder = False
-        data = None
-
-        with self._error_catcher():
-            if amt is None:
-                # cStringIO doesn't like amt=None
-                data = self._fp.read()
-                flush_decoder = True
-            else:
-                cache_content = False
-                data = self._fp.read(amt)
-                if amt != 0 and not data:  # Platform-specific: Buggy versions of Python.
-                    # Close the connection when no data is returned
-                    #
-                    # This is redundant to what httplib/http.client _should_
-                    # already do.  However, versions of python released before
-                    # December 15, 2012 (http://bugs.python.org/issue16298) do
-                    # not properly close the connection in all cases. There is
-                    # no harm in redundantly calling close.
-                    self._fp.close()
-                    flush_decoder = True
-                    if self.enforce_content_length and self.length_remaining not in (0, None):
-                        # This is an edge case that httplib failed to cover due
-                        # to concerns of backward compatibility. We're
-                        # addressing it here to make sure IncompleteRead is
-                        # raised during streaming, so all calls with incorrect
-                        # Content-Length are caught.
-                        raise IncompleteRead(self._fp_bytes_read, self.length_remaining)
-
-        if data:
-            self._fp_bytes_read += len(data)
-            if self.length_remaining is not None:
-                self.length_remaining -= len(data)
+        if amt and amt < 0:
+            # Negative numbers and `None` should be treated the same.
+            amt = None
+        elif amt is not None:
+            cache_content = False
+
+            if self._decoder and self._decoder.has_unconsumed_tail:
+                decoded_data = self._decode(
+                    b"",
+                    decode_content,
+                    flush_decoder=False,
+                    max_length=amt - len(self._decoded_buffer),
+                )
+                self._decoded_buffer.put(decoded_data)
+            if len(self._decoded_buffer) >= amt:
+                return self._decoded_buffer.get(amt)
+
+        data = self._raw_read(amt)
+
+        flush_decoder = amt is None or (amt != 0 and not data)
+
+        if (
+            not data
+            and len(self._decoded_buffer) == 0
+            and not (self._decoder and self._decoder.has_unconsumed_tail)
+        ):
+            return data
 
+        if amt is None:
             data = self._decode(data, decode_content, flush_decoder)
 
             if cache_content:
                 self._body = data
+        else:
+            # do not waste memory on buffer when not decoding
+            if not decode_content:
+                if self._has_decoded_content:
+                    raise RuntimeError(
+                        "Calling read(decode_content=False) is not supported after "
+                        "read(decode_content=True) was called."
+                    )
+                return data
+
+            decoded_data = self._decode(
+                data,
+                decode_content,
+                flush_decoder,
+                max_length=amt - len(self._decoded_buffer),
+            )
+            self._decoded_buffer.put(decoded_data)
+
+            while len(self._decoded_buffer) < amt and data:
+                # TODO make sure to initially read enough data to get past the headers
+                # For example, the GZ file header takes 10 bytes, we don't want to read
+                # it one byte at a time
+                data = self._raw_read(amt)
+                decoded_data = self._decode(
+                    data,
+                    decode_content,
+                    flush_decoder,
+                    max_length=amt - len(self._decoded_buffer),
+                )
+                self._decoded_buffer.put(decoded_data)
+            data = self._decoded_buffer.get(amt)
 
         return data
 
@@ -432,7 +695,11 @@ class HTTPResponse(io.IOBase):
             for line in self.read_chunked(amt, decode_content=decode_content):
                 yield line
         else:
-            while not is_fp_closed(self._fp):
+            while(
+                not is_fp_closed(self._fp)
+                or len(self._decoded_buffer) > 0
+                or (self._decoder and self._decoder.has_unconsumed_tail)
+            ):
                 data = self.read(amt=amt, decode_content=decode_content)
 
                 if data:
@@ -600,7 +867,7 @@ class HTTPResponse(io.IOBase):
                     break
                 chunk = self._handle_chunk(amt)
                 decoded = self._decode(chunk, decode_content=decode_content,
-                                       flush_decoder=False)
+                                       flush_decoder=False, max_length=amt)
                 if decoded:
                     yield decoded
 
Index: urllib3-1.22/test/test_response.py
===================================================================
--- urllib3-1.22.orig/test/test_response.py
+++ urllib3-1.22/test/test_response.py
@@ -1,10 +1,12 @@
 import socket
+import gzip
+import zlib
 
 from io import BytesIO, BufferedReader
 
 import pytest
 
-from urllib3.response import HTTPResponse
+from urllib3.response import HTTPResponse, BytesQueueBuffer
 from urllib3.exceptions import (
     DecodeError, ResponseNotChunked, ProtocolError, InvalidHeader
 )
@@ -36,6 +38,20 @@ def sock():
     yield s
     s.close()
 
+def deflate2_compress(data):
+    compressor = zlib.compressobj(6, zlib.DEFLATED, -zlib.MAX_WBITS)
+    return compressor.compress(data) + compressor.flush()
+
+
+class TestBytesQueueBuffer:
+    def test_memory_usage_single_chunk(
+        self
+    ):
+        buffer = BytesQueueBuffer()
+        chunk = bytes(10 * 2**20) # 10 MiB
+        buffer.put(chunk)
+        assert buffer.get(len(buffer)) is chunk
+
 
 class TestLegacyResponse(object):
     def test_getheaders(self):
@@ -120,12 +136,7 @@ class TestResponse(object):
         r = HTTPResponse(fp, headers={'content-encoding': 'deflate'},
                          preload_content=False)
 
-        assert r.read(3) == b''
-        # Buffer in case we need to switch to the raw stream
-        assert r._decoder._data is not None
         assert r.read(1) == b'f'
-        # Now that we've decoded data, we just stream through the decoder
-        assert r._decoder._data is None
         assert r.read(2) == b'oo'
         assert r.read() == b''
         assert r.read() == b''
@@ -140,10 +151,7 @@ class TestResponse(object):
         r = HTTPResponse(fp, headers={'content-encoding': 'deflate'},
                          preload_content=False)
 
-        assert r.read(1) == b''
         assert r.read(1) == b'f'
-        # Once we've decoded data, we just stream to the decoder; no buffering
-        assert r._decoder._data is None
         assert r.read(2) == b'oo'
         assert r.read() == b''
         assert r.read() == b''
@@ -158,7 +166,6 @@ class TestResponse(object):
         r = HTTPResponse(fp, headers={'content-encoding': 'gzip'},
                          preload_content=False)
 
-        assert r.read(11) == b''
         assert r.read(1) == b'f'
         assert r.read(2) == b'oo'
         assert r.read() == b''
@@ -248,8 +255,8 @@ class TestResponse(object):
                             preload_content=False)
         stream = resp.stream(2)
 
-        assert next(stream) == b'f'
-        assert next(stream) == b'oo'
+        assert next(stream) == b'fo'
+        assert next(stream) == b'o'
         with pytest.raises(StopIteration):
             next(stream)
 
@@ -280,6 +287,7 @@ class TestResponse(object):
         import zlib
 
         NUMBER_OF_READS = 10
+        PART_SIZE = 64
 
         class MockCompressedDataReading(BytesIO):
             """
@@ -306,7 +314,7 @@ class TestResponse(object):
         fp = MockCompressedDataReading(ZLIB_PAYLOAD, payload_part_size)
         resp = HTTPResponse(fp, headers={'content-encoding': 'deflate'},
                             preload_content=False)
-        stream = resp.stream()
+        stream = resp.stream(PART_SIZE)
 
         parts_positions = [(part, resp.tell()) for part in stream]
         end_of_stream = resp.tell()
@@ -321,12 +329,28 @@ class TestResponse(object):
         assert uncompressed_data == payload
 
         # Check that the positions in the stream are correct
-        expected = [(i+1)*payload_part_size for i in range(NUMBER_OF_READS)]
-        assert expected == list(positions)
+        # It is difficult to determine programmatically what the positions
+        # returned by `tell` will be because the `HTTPResponse.read` method may
+        # call socket `read` a couple of times if it doesn't have enough data
+        # in the buffer or not call socket `read` at all if it has enough. All
+        # this depends on the message, how it was compressed, what is
+        # `PART_SIZE` and `payload_part_size`.
+        # So for simplicity the expected values are hardcoded.
+        expected = (92, 184, 230, 276, 322, 368, 414, 460)
+        assert expected == positions
 
         # Check that the end of the stream is in the correct place
         assert len(ZLIB_PAYLOAD) == end_of_stream
 
+        # Check that all parts have expected length
+        expected_last_part_size = len(uncompressed_data) % PART_SIZE
+        whole_parts = len(uncompressed_data) // PART_SIZE
+        if expected_last_part_size == 0:
+            expected_lengths = [PART_SIZE] * whole_parts
+        else:
+            expected_lengths = [PART_SIZE] * whole_parts + [expected_last_part_size]
+        assert expected_lengths == [len(part) for part in parts]
+
     def test_deflate_streaming(self):
         import zlib
         data = zlib.compress(b'foo')
@@ -336,8 +360,8 @@ class TestResponse(object):
                             preload_content=False)
         stream = resp.stream(2)
 
-        assert next(stream) == b'f'
-        assert next(stream) == b'oo'
+        assert next(stream) == b'fo'
+        assert next(stream) == b'o'
         with pytest.raises(StopIteration):
             next(stream)
 
@@ -352,8 +376,8 @@ class TestResponse(object):
                             preload_content=False)
         stream = resp.stream(2)
 
-        assert next(stream) == b'f'
-        assert next(stream) == b'oo'
+        assert next(stream) == b'fo'
+        assert next(stream) == b'o'
         with pytest.raises(StopIteration):
             next(stream)
 
Index: urllib3-1.22/test/with_dummyserver/test_socketlevel.py
===================================================================
--- urllib3-1.22.orig/test/with_dummyserver/test_socketlevel.py
+++ urllib3-1.22/test/with_dummyserver/test_socketlevel.py
@@ -1397,9 +1397,6 @@ class TestBadContentLength(SocketDummySe
         get_response = conn.request('GET', url='/', preload_content=False,
                                     enforce_content_length=True)
         data = get_response.stream(100)
-        # Read "good" data before we try to read again.
-        # This won't trigger till generator is exhausted.
-        next(data)
         try:
             next(data)
             self.assertFail()
