From c3c931f5608da23a196b216edfcc2af5b626fc9a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sa=C3=BAl=20Ibarra=20Corretg=C3=A9?= <s@saghul.net>
Date: Mon, 4 Aug 2025 10:27:07 +0200
Subject: [PATCH] Refactor channel destruction logic

- Use ares_queue_wait_empty to wait for queries to be complete before
  destruction
- Make sure NO queries are cancelled as side effects on __del__
- Start the destruction thread early, as soon as a channel is created

Cancelling pending queries while in a query callback seemingly causes
heap corruption or double-free bugs, so delay the operation until no
Python code if using the channel anymore, that is, the destructor
thread.

Fixes: https://github.com/aio-libs/aiodns/issues/175
Fixes: https://github.com/saghul/pycares/issues/248
---
 docs/channel.rst             | 13 ------
 src/_cffi_src/build_cares.py | 78 +++++++++++++++++++++++-------------
 src/pycares/__init__.py      | 61 ++++++++++++----------------
 3 files changed, 76 insertions(+), 76 deletions(-)

Index: pycares-4.9.0/docs/channel.rst
===================================================================
--- pycares-4.9.0.orig/docs/channel.rst
+++ pycares-4.9.0/docs/channel.rst
@@ -77,25 +77,6 @@
         While channels will attempt automatic cleanup during garbage collection, explicit
         closing is safer as it gives you control over when resources are released.
 
-    .. warning::
-        The channel destruction mechanism has a limited throughput of 60 channels per minute
-        (one channel per second) to ensure thread safety and prevent use-after-free errors
-        in c-ares. This means:
-
-        - Avoid creating transient channels for individual queries
-        - Reuse channel instances whenever possible
-        - For applications with high query volume, use a single long-lived channel
-        - If you must create multiple channels, consider pooling them
-
-        Creating and destroying channels rapidly will result in a backlog as the destruction
-        queue processes channels sequentially with a 1-second delay between each.
-
-    The Channel class supports the context manager protocol for automatic cleanup::
-
-        with pycares.Channel() as channel:
-            channel.query('example.com', pycares.QUERY_TYPE_A, callback)
-        # Channel is automatically closed when exiting the context
-
     .. py:method:: getaddrinfo(host, port, callback, family=0, type=0, proto=0, flags=0)
 
         :param string host: Hostname to resolve.
Index: pycares-4.9.0/src/_cffi_src/build_cares.py
===================================================================
--- pycares-4.9.0.orig/src/_cffi_src/build_cares.py
+++ pycares-4.9.0/src/_cffi_src/build_cares.py
@@ -90,34 +90,6 @@ struct sockaddr_in6 {
 typedef int... ares_socket_t;
 typedef int... ares_socklen_t;
 
-#define ARES_SUCCESS            ...
-
-#define ARES_ENODATA            ...
-#define ARES_EFORMERR           ...
-#define ARES_ESERVFAIL          ...
-#define ARES_ENOTFOUND          ...
-#define ARES_ENOTIMP            ...
-#define ARES_EREFUSED           ...
-#define ARES_EBADQUERY          ...
-#define ARES_EBADNAME           ...
-#define ARES_EBADFAMILY         ...
-#define ARES_EBADRESP           ...
-#define ARES_ECONNREFUSED       ...
-#define ARES_ETIMEOUT           ...
-#define ARES_EOF                ...
-#define ARES_EFILE              ...
-#define ARES_ENOMEM             ...
-#define ARES_EDESTRUCTION       ...
-#define ARES_EBADSTR            ...
-#define ARES_EBADFLAGS          ...
-#define ARES_ENONAME            ...
-#define ARES_EBADHINTS          ...
-#define ARES_ENOTINITIALIZED    ...
-#define ARES_ELOADIPHLPAPI           ...
-#define ARES_EADDRGETNETWORKPARAMS   ...
-#define ARES_ECANCELLED         ...
-#define ARES_ESERVICE           ...
-
 #define ARES_FLAG_USEVC         ...
 #define ARES_FLAG_PRIMARY       ...
 #define ARES_FLAG_IGNTC         ...
@@ -229,6 +201,54 @@ struct ares_server_failover_options {
   size_t         retry_delay;
 };
 
+typedef enum {
+  ARES_SUCCESS = 0,
+
+  /* Server error codes (ARES_ENODATA indicates no relevant answer) */
+  ARES_ENODATA   = 1,
+  ARES_EFORMERR  = 2,
+  ARES_ESERVFAIL = 3,
+  ARES_ENOTFOUND = 4,
+  ARES_ENOTIMP   = 5,
+  ARES_EREFUSED  = 6,
+
+  /* Locally generated error codes */
+  ARES_EBADQUERY    = 7,
+  ARES_EBADNAME     = 8,
+  ARES_EBADFAMILY   = 9,
+  ARES_EBADRESP     = 10,
+  ARES_ECONNREFUSED = 11,
+  ARES_ETIMEOUT     = 12,
+  ARES_EOF          = 13,
+  ARES_EFILE        = 14,
+  ARES_ENOMEM       = 15,
+  ARES_EDESTRUCTION = 16,
+  ARES_EBADSTR      = 17,
+
+  /* ares_getnameinfo error codes */
+  ARES_EBADFLAGS = 18,
+
+  /* ares_getaddrinfo error codes */
+  ARES_ENONAME   = 19,
+  ARES_EBADHINTS = 20,
+
+  /* Uninitialized library error code */
+  ARES_ENOTINITIALIZED = 21, /* introduced in 1.7.0 */
+
+  /* ares_library_init error codes */
+  ARES_ELOADIPHLPAPI         = 22, /* introduced in 1.7.0 */
+  ARES_EADDRGETNETWORKPARAMS = 23, /* introduced in 1.7.0 */
+
+  /* More error codes */
+  ARES_ECANCELLED = 24, /* introduced in 1.7.0 */
+
+  /* More ares_getaddrinfo error codes */
+  ARES_ESERVICE = 25, /* ares_getaddrinfo() was passed a text service name that
+                       * is not recognized. introduced in 1.16.0 */
+
+  ARES_ENOSERVER = 26 /* No DNS servers were configured */
+} ares_status_t;
+
 /*! Values for ARES_OPT_EVENT_THREAD */
 typedef enum {
   /*! Default (best choice) event system */
@@ -597,6 +617,8 @@ const char *ares_inet_ntop(int af, const
 int ares_inet_pton(int af, const char *src, void *dst);
 
 ares_bool_t ares_threadsafety(void);
+
+ares_status_t ares_queue_wait_empty(ares_channel channel, int timeout_ms);
 """
 
 CALLBACKS = """
Index: pycares-4.9.0/src/pycares/__init__.py
===================================================================
--- pycares-4.9.0.orig/src/pycares/__init__.py
+++ pycares-4.9.0/src/pycares/__init__.py
@@ -12,10 +12,8 @@ from ._version import __version__
 import socket
 import math
 import threading
-import time
 import weakref
 from collections.abc import Callable, Iterable
-from contextlib import suppress
 from typing import Any, Callable, Optional, Dict, Union
 from queue import SimpleQueue
 
@@ -344,7 +342,7 @@ class _ChannelShutdownManager:
     def __init__(self) -> None:
         self._queue: SimpleQueue = SimpleQueue()
         self._thread: Optional[threading.Thread] = None
-        self._thread_started = False
+        self._start_lock = threading.Lock()
 
     def _run_safe_shutdown_loop(self) -> None:
         """Process channel destruction requests from the queue."""
@@ -352,16 +350,27 @@ class _ChannelShutdownManager:
             # Block forever until we get a channel to destroy
             channel = self._queue.get()
 
-            # Sleep for 1 second to ensure c-ares has finished processing
-            # Its important that c-ares is past this critcial section
-            # so we use a delay to ensure it has time to finish processing
-            # https://github.com/c-ares/c-ares/blob/4f42928848e8b73d322b15ecbe3e8d753bf8734e/src/lib/ares_process.c#L1422
-            time.sleep(1.0)
+            # Cancel all pending queries - this will trigger callbacks with ARES_ECANCELLED
+            _lib.ares_cancel(channel[0])
+
+            # Wait for all queries to finish
+            _lib.ares_queue_wait_empty(channel[0], -1)
 
             # Destroy the channel
             if _lib is not None and channel is not None:
                 _lib.ares_destroy(channel[0])
 
+    def start(self) -> None:
+        """Start the background thread if not already started."""
+        if self._thread is not None:
+            return
+        with self._start_lock:
+            if self._thread is not None:
+                # Started by another thread while waiting for the lock
+                return
+            self._thread = threading.Thread(target=self._run_safe_shutdown_loop, daemon=True)
+            self._thread.start()
+
     def destroy_channel(self, channel) -> None:
         """
         Schedule channel destruction on the background thread with a safety delay.
@@ -369,17 +378,10 @@ class _ChannelShutdownManager:
         Thread Safety and Synchronization:
         This method uses SimpleQueue which is thread-safe for putting items
         from multiple threads. The background thread processes channels
-        sequentially with a 1-second delay before each destruction.
+        sequentially waiting for queries to end before each destruction.
         """
-        # Put the channel in the queue
         self._queue.put(channel)
 
-        # Start the background thread if not already started
-        if not self._thread_started:
-            self._thread_started = True
-            self._thread = threading.Thread(target=self._run_safe_shutdown_loop, daemon=True)
-            self._thread.start()
-
 
 # Global shutdown manager instance
 _shutdown_manager = _ChannelShutdownManager()
@@ -516,11 +518,12 @@ class Channel:
         self.close()
         return False
 
+        # Ensure the shutdown thread is started
+        _shutdown_manager.start()
+
     def __del__(self) -> None:
         """Ensure the channel is destroyed when the object is deleted."""
-        if self._channel is not None:
-            # Schedule channel destruction using the global shutdown manager
-            self._schedule_destruction()
+        self.close()
 
     def _create_callback_handle(self, callback_data):
         """
@@ -764,24 +767,12 @@ class Channel:
             # Already destroyed
             return
 
-        # Cancel all pending queries - this will trigger callbacks with ARES_ECANCELLED
-        self.cancel()
+        # NB: don't cancel queries here, it may lead to problem if done from a
+        # query callback.
 
         # Schedule channel destruction
-        self._schedule_destruction()
-
-    def _schedule_destruction(self) -> None:
-        """Schedule channel destruction using the global shutdown manager."""
-        if self._channel is None:
-            return
-        channel = self._channel
-        self._channel = None
-        # Can't start threads during interpreter shutdown
-        # The channel will be cleaned up by the OS
-        # TODO: Change to PythonFinalizationError when Python 3.12 support is dropped
-        with suppress(RuntimeError):
-            _shutdown_manager.destroy_channel(channel)
-
+        channel, self._channel = self._channel, None
+        _shutdown_manager.destroy_channel(channel)
 
 
 class AresResult:
