From 5361c2b7f48599718cdecbe50c5fdd88b28ffd79 Mon Sep 17 00:00:00 2001
From: Toddr Bot <toddbot@rinaldo.us>
Date: Mon, 16 Mar 2026 20:55:31 +0000
Subject: [PATCH] Fix buffer overflow in parse_stream when filehandle has :utf8
 layer

When a filehandle has a :utf8 PerlIO layer, Perl's read() returns
decoded characters, but SvPV() gives back the UTF-8 byte
representation which can be larger than the pre-allocated XML buffer.
Previously this caused heap corruption (double free / buffer overflow).
A later workaround (a BUFSIZE * 6 byte buffer plus a croak when even that
was too small) prevented the corruption but still aborted the parse.

Fix by re-obtaining the expat buffer at the actual byte size when the
read produces more bytes than initially allocated. This handles UTF-8
streams gracefully without wasting memory on an oversized buffer.

Fixes https://github.com/cpan-authors/XML-Parser/issues/64
(migrated from rt.cpan.org #19859)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 Expat/Expat.xs  | 15 +++++++++++----
 t/utf8_stream.t | 40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+), 4 deletions(-)
 create mode 100644 t/utf8_stream.t

diff --git a/Expat/Expat.xs b/Expat/Expat.xs
index 32fdce5..3cd1154 100644
--- a/Expat/Expat.xs
+++ b/Expat/Expat.xs
@@ -343,8 +343,8 @@ parse_stream(XML_Parser parser, SV * ioref)
   }
   else {
     tbuff = newSV(0);
-    tsiz = newSViv(BUFSIZE); /* in UTF-8 characters */
-    buffsize = BUFSIZE * 6; /* in bytes that encode an UTF-8 string */
+    tsiz = newSViv(BUFSIZE);
+    buffsize = BUFSIZE;
   }
 
   while (! done)
@@ -387,8 +387,15 @@ parse_stream(XML_Parser parser, SV * ioref)
 
 	tb = SvPV(tbuff, br);
 	if (br > 0) {
-	  if (br > buffsize)
-	    croak("The input buffer is not large enough for read UTF-8 decoded string");
+	  if (br > buffsize) {
+	    /* The byte count from SvPV can exceed buffsize when the
+	       filehandle has a :utf8 layer, since Perl reads buffsize
+	       characters but multi-byte UTF-8 chars produce more bytes.
+	       Re-obtain the buffer at the required size. */
+	    buffer = XML_GetBuffer(parser, br);
+	    if (! buffer)
+	      croak("Ran out of memory for input buffer");
+	  }
 	  Copy(tb, buffer, br, char);
 	} else
 	  done = 1;
diff --git a/t/utf8_stream.t b/t/utf8_stream.t
new file mode 100644
index 0000000..a7e55f7
--- /dev/null
+++ b/t/utf8_stream.t
@@ -0,0 +1,40 @@
+use strict;
+use warnings;
+my $loaded;
+BEGIN { print "1..2\n"; }
+END { print "not ok 1\n" unless $loaded; }
+use XML::Parser;
+use File::Temp qw(tempfile);
+$loaded = 1;
+print "ok 1\n";
+
+# Regression test for rt.cpan.org #19859 / GitHub issue #64: parsing from a
+# filehandle with a :utf8 layer overflowed the expat input buffer because the
+# SvPV byte count could exceed the size requested from XML_GetBuffer.
+
+# U+4E16 U+754C, 3 bytes each in UTF-8: 20000 repeats = 40000 chars / 120000 bytes.
+my $repeats = 20000;
+my $body    = "\xe4\xb8\x96\xe7\x95\x8c" x $repeats;    # raw UTF-8 bytes written to disk
+my $expect  = "\x{4E16}\x{754C}" x $repeats;            # the same text as decoded characters
+
+# Write the document as raw bytes so that only the read side decodes UTF-8.
+my ($fh, $tmpfile) = tempfile(UNLINK => 1);
+binmode($fh, ':raw');
+print {$fh} qq(<?xml version="1.0" encoding="UTF-8"?>\n<doc>$body</doc>\n)
+    or die "Cannot write $tmpfile: $!";
+close($fh) or die "Cannot close $tmpfile: $!";
+
+my $text = '';
+my $parser = XML::Parser->new(Handlers => { Char => sub { $text .= $_[1]; } });
+
+# Open with :utf8 layer - this is what triggers the bug
+open(my $in, '<:utf8', $tmpfile) or die "Cannot open $tmpfile: $!";
+# Capture the error right away so the close() below cannot disturb $@.
+my $err = eval { $parser->parse($in); 1 } ? '' : ($@ || 'unknown error');
+close($in);
+
+# Require the exact character data back: truncated or corrupted text must fail too.
+if ($err eq '' && $text eq $expect) {
+    print "ok 2\n";
+} else {
+    print "not ok 2 # ", ($err || 'parsed text differs from input'), "\n";
+}

