From f764f220a3edae69e8c6eabe4e87fbe2dfce1880 Mon Sep 17 00:00:00 2001
From: Alexander Edland <s@ocv.me>
Date: Fri, 18 Apr 2025 10:15:55 +0000
Subject: [PATCH] main/unzip: fix zipbomb false-positive

Add another patch from madler's fork, fixing a false-positive when
extracting a mixed-zip/zip64 streaming-mode zipfile. The previous
patchset assumed that data-descriptors would always use 64bit fields
for zip64 entries, but the zip-spec can be interpreted to expect
32bit values until one of those overflow (libarchive and others).

https://github.com/madler/unzip/commit/af0d07f95809
https://bugzilla.redhat.com/show_bug.cgi?id=2360938
---
 main/unzip/APKBUILD             |   6 +-
 main/unzip/zipbomb-part7.patch  | 172 ++++++++++++++++++++++++++++++++
 main/unzip/zipbomb-switch.patch |  29 ++----
 3 files changed, 184 insertions(+), 23 deletions(-)
 create mode 100644 main/unzip/zipbomb-part7.patch

diff --git a/main/unzip/APKBUILD b/main/unzip/APKBUILD
index ab0a703b2fc..131e92c4f95 100644
--- a/main/unzip/APKBUILD
+++ b/main/unzip/APKBUILD
@@ -3,7 +3,7 @@
 pkgname=unzip
 pkgver=6.0
 _pkgver=${pkgver//./}
-pkgrel=15
+pkgrel=16
 pkgdesc="Extract PKZIP-compatible .zip files"
 url="http://www.info-zip.org/UnZip.html"
 arch="all"
@@ -39,6 +39,7 @@ source="https://dev.alpinelinux.org/archive/unzip/unzip$_pkgver.tgz
 	zipbomb-part4.patch
 	zipbomb-part5.patch
 	zipbomb-part6.patch
+	zipbomb-part7.patch
 	zipbomb-switch.patch
 	"
 builddir="$srcdir/$pkgname$_pkgver"
@@ -119,5 +120,6 @@ e20e97722e0daf48b97df540added603325d356c6597634afd694af3972bb62952dd0f92c10d98f8
 27d45a25a6a51415af609a4fdefcb7c95a1105d511a6e18e2a7464e9d3773ba2ccb25f138a3cc6ddc6e5e9c558b633ee60d273cebf562c2a7d1e99d3f229d1ba  zipbomb-part4.patch
 48875d7e08d669637e26a7e800f8b2a3812d477e6f249c8d4962fdf93ba6d346f5b22b83d82cb65317b506dff84c441d42c0fe7d1c042a065619d39bdf25fdd0  zipbomb-part5.patch
 a788d57fe0fb9ae6106381d2a8fe566aa35bb037012139dc7c283fe5eb316056835dffa9ea9778c15a5b39e50a75329a135a0dffdfc6a53d575ef2013b1d478a  zipbomb-part6.patch
-d86aba51101fdbe855c35f034d33d65a79c5c707d01de4709619f5d1316185777048b72c293f9506186677bcecf54a808e106ad59bb36835ef80615641c85d63  zipbomb-switch.patch
+18837ceb96dd86d5b24435bbb04b36b062449b50e65da8ab6f4fbd1082cc4afea6c42a9f363fc790995ea5450e3d6c36e749aaa8fd4b7dd8415e304dadcb5290  zipbomb-part7.patch
+c1383b1886ec433e22adb24b00fc5de0d7792cf727afef62d54940a4bb39f2cd6f465f28ef0b9ab68a6798f3df7bb227fa1ec162fd4d4c671c9e4b46a1c2ef42  zipbomb-switch.patch
 "
diff --git a/main/unzip/zipbomb-part7.patch b/main/unzip/zipbomb-part7.patch
new file mode 100644
index 00000000000..4edc15200b0
--- /dev/null
+++ b/main/unzip/zipbomb-part7.patch
@@ -0,0 +1,172 @@
+From af0d07f95809653b669d88aa0f424c6d5aa48ba0 Mon Sep 17 00:00:00 2001
+From: Mark Adler <fork@madler.net>
+Date: Sat, 2 Jul 2022 14:35:04 -0700
+Subject: [PATCH] Be more liberal in the acceptance of data descriptors.
+
+Previously the zip64 flag determined the size of the lengths in the
+data descriptor. This is compliant with the zip format. However, a
+bug in the Java zip library results in an incorrect setting of that
+flag. This commit permits either 32-bit or 64-bit lengths, auto-
+detecting which it is, which works around the Java bug.
+---
+ extract.c | 146 +++++++++++++++++++++++++++++++++++++++++++++---------
+ 1 file changed, 123 insertions(+), 23 deletions(-)
+
+diff --git a/extract.c b/extract.c
+index 878817d..b1c74df 100644
+--- a/extract.c
++++ b/extract.c
+@@ -2173,30 +2173,130 @@ static int extract_or_test_member(__G)    /* return PK-type error code */
+     undefer_input(__G);
+ 
+     if ((G.lrec.general_purpose_bit_flag & 8) != 0) {
+-        /* skip over data descriptor (harder than it sounds, due to signature
+-         * ambiguity)
+-         */
+-#       define SIG 0x08074b50
+-#       define LOW 0xffffffff
+-        uch buf[12];
+-        unsigned shy = 12 - readbuf((char *)buf, 12);
+-        ulg crc = shy ? 0 : makelong(buf);
+-        ulg clen = shy ? 0 : makelong(buf + 4);
+-        ulg ulen = shy ? 0 : makelong(buf + 8); /* or high clen if ZIP64 */
+-        if (crc == SIG &&                       /* if not SIG, no signature */
+-            (G.lrec.crc32 != SIG ||             /* if not SIG, have signature */
+-             (clen == SIG &&                    /* if not SIG, no signature */
+-              ((G.lrec.csize & LOW) != SIG ||   /* if not SIG, have signature */
+-               (ulen == SIG &&                  /* if not SIG, no signature */
+-                (G.pInfo->zip64 ? G.lrec.csize >> 32 : G.lrec.ucsize) != SIG
+-                                                /* if not SIG, have signature */
+-                )))))
+-                   /* skip four more bytes to account for signature */
+-                   shy += 4 - readbuf((char *)buf, 4);
+-        if (G.pInfo->zip64)
+-            shy += 8 - readbuf((char *)buf, 8); /* skip eight more for ZIP64 */
+-        if (shy)
++        // Skip over the data descriptor. We need to correctly position the
++        // read pointer after the data descriptor for the proper detection of
++        // overlapped zip file components.
++        //
++        // We need to resolve an ambiguity over four possible data descriptor
++        // formats. We check for all four, and pick the longest match. The data
++        // descriptor can have a signature or not, and it can use four or
++        // eight-byte lengths. The zip format requires resolving the ambiguity
++        // of a signature or not, but it uses the zip64 flag to determine
++        // whether the lengths are four or eight bytes. However there is a bug
++        // in the Java zip library that applies the wrong value of that flag.
++        // This works around that bug by always trying both length formats.
++        //
++        // So why the longest match? And does this resolve the ambiguity? No,
++        // it doesn't definitively resolve the ambiguity. However choosing the
++        // longest match at least resolves it for a normal zip file, where the
++        // bytes following the data descriptor must be another zip signature
++        // that is not a data descriptor signature. There are a few specific
++        // cases for which more than one of the formats will match the given
++        // CRC and lengths. The most plausible is between four and eight-byte
++        // lengths, either with or without a signature. That only occurs for an
++        // entry with an uncompressed size of zero. We consider the data
++        // descriptor to be a vector of four-byte values. Then the possible
++        // data descriptors are [(s) 0 c 0] and [(s) 0 c 0 0 0], where (s) is
++        // the optional signature, and c is the compressed length. c would be
++        // two for the Deflate compressed data format. These look the same, so
++        // if the file contains [(s) 0 c 0 0 0], then we cannot discriminate
++        // them. However if the data descriptor was intended to be [(s) 0 c 0],
++        // then it has been followed by eight zero bytes in the zip file for
++        // some reason. For a normal zip file this cannot be the case. The data
++        // descriptor would always be immediately followed by another zip file
++        // signature, which is four bytes that are not zeros. The other cases
++        // where more than one format matches are vanishingly unlikely, but the
++        // longest match strategy resolves those as well in a normal zip file.
++        // Those pairs are [s s s] vs. [s s s s], [s s s] vs. [s s s 0 s 0],
++        // and [s s s s s] vs. [s s s s s s]. For all, s is the signature for a
++        // data descriptor. For the first two we have an entry whose CRC,
++        // compressed length, and uncompressed length are all equal (!), and
++        // are all equal to the signature (!!). If this occurs, clearly someone
++        // is messing with us. However the strategy works nonetheless. We see
++        // that if the shorter descriptor, [s s s] were what was intended, then
++        // it has been followed by either four zero bytes or a data descriptor
++        // signature. Neither can occur for a normal zip file, where it must be
++        // followed by a signature that is not a data descriptor signature. So
++        // the longest match is the correct choice. The final case is outright
++        // insane, since the compressed and uncompressed lengths are the data
++        // descriptor signature repeated twice to make a 64-bit length, which
++        // is about 6e17. The largest drive available as I write this is 100TB,
++        // which is one six thousandth of that length. If I apply Moore's law
++        // to drive capacity, we might get to 6e17 about 25 years from now. If
++        // this code is still in use then (I've seen other code I've written in
++        // use for over 30 years), then we're still in luck. A data descriptor
++        // cannot be followed by a data descriptor signature in a normal zip
++        // file. The longest match strategy continues to work.
++        //
++        // So what is a not normal zip file, where these assumptions might fall
++        // apart? zip files have been used in a non-standard way as a poor
++        // substitute for a file system, with entries deleted and perhaps
++        // others replacing them partially, with fragmented zip files being the
++        // result. Then all bets are off as to what might or might not follow a
++        // data descriptor. Though if this sort of data descriptor ambiguity
++        // falls in one of those gaps, then there should be no adverse
++        // consequences for picking the unintended one.
++        int len = 0;
++#       define SIG 0x08074b50           // optional data descriptor signature
++#ifdef LARGE_FILE_SUPPORT
++        uch buf[24];
++        int got = readbuf((char *)buf, sizeof(buf));
++        if (got >= 24 && makelong(buf) == SIG &&
++                         makelong(buf + 4) == G.lrec.crc32 &&
++                         makeint64(buf + 8) == G.lrec.csize &&
++                         makeint64(buf + 16) == G.lrec.ucsize)
++            // Have a data descriptor with a signature and 64-bit lengths.
++            len = 24;
++        else if (got >= 20 && makelong(buf) == G.lrec.crc32 &&
++                              makeint64(buf + 4) == G.lrec.csize &&
++                              makeint64(buf + 12) == G.lrec.ucsize)
++            // Have a data descriptor with no signature and 64-bit lengths.
++            len = 20;
++        else if ((G.lrec.csize >> 32) == 0 && (G.lrec.ucsize >> 32) == 0)
++            // Both lengths are short enough to fit in 32 bits.
++#else
++        uch buf[16];
++        int got = readbuf((char *)buf, sizeof(buf));
++#endif
++        {
++            if (got >= 16 && makelong(buf) == SIG &&
++                             makelong(buf + 4) == G.lrec.crc32 &&
++                             makelong(buf + 8) == G.lrec.csize &&
++                             makelong(buf + 12) == G.lrec.ucsize)
++                // Have a data descriptor with a signature and 32-bit lengths.
++                len = 16;
++            else if (got >= 12 && makelong(buf) == G.lrec.crc32 &&
++                                  makelong(buf + 4) == G.lrec.csize &&
++                                  makelong(buf + 8) == G.lrec.ucsize)
++                // Have a data descriptor with no signature and 32-bit lengths.
++                len = 12;
++        }
++        if (len == 0)
++            // There is no data descriptor that matches the entry CRC and
++            // length values.
+             error = PK_ERR;
++
++        // Back up got-len bytes, to position the read pointer after the data
++        // descriptor. Or to where the data descriptor was supposed to be, in
++        // the event none was found.
++        int back = got - len;
++        if (G.incnt + back > INBUFSIZ) {
++            // Need to load the preceding buffer. We've been here before.
++            G.cur_zipfile_bufstart -= INBUFSIZ;
++#ifdef USE_STRM_INPUT
++            zfseeko(G.zipfd, G.cur_zipfile_bufstart, SEEK_SET);
++#else /* !USE_STRM_INPUT */
++            zlseek(G.zipfd, G.cur_zipfile_bufstart, SEEK_SET);
++#endif /* ?USE_STRM_INPUT */
++            read(G.zipfd, (char *)G.inbuf, INBUFSIZ);
++            G.incnt -= INBUFSIZ - back;
++            G.inptr += INBUFSIZ - back;
++        }
++        else {
++            // Back up within current buffer.
++            G.incnt += back;
++            G.inptr -= back;
++        }
+     }
+ 
+     return error;
diff --git a/main/unzip/zipbomb-switch.patch b/main/unzip/zipbomb-switch.patch
index c6d33c026b9..97c5bf71dee 100644
--- a/main/unzip/zipbomb-switch.patch
+++ b/main/unzip/zipbomb-switch.patch
@@ -129,7 +129,7 @@ index 878817d..3e58071 100644
          }
  #ifdef MACOS  /* MacOS is no preemptive OS, thus call event-handling by hand */
          UserStop();
-@@ -2171,8 +2178,8 @@ static int extract_or_test_member(__G)    /* return PK-type error code */
+@@ -2177,8 +2184,8 @@ static int extract_or_test_member(__G)    /* return PK-type error code */
      }
  
      undefer_input(__G);
@@ -137,26 +137,13 @@ index 878817d..3e58071 100644
 -    if ((G.lrec.general_purpose_bit_flag & 8) != 0) {
 +    if (uO.zipbomb == TRUE) {
 +      if ((G.lrec.general_purpose_bit_flag & 8) != 0) {
-         /* skip over data descriptor (harder than it sounds, due to signature
-          * ambiguity)
-          */
-@@ -2189,16 +2196,16 @@ static int extract_or_test_member(__G)    /* return PK-type error code */
-               ((G.lrec.csize & LOW) != SIG ||   /* if not SIG, have signature */
-                (ulen == SIG &&                  /* if not SIG, no signature */
-                 (G.pInfo->zip64 ? G.lrec.csize >> 32 : G.lrec.ucsize) != SIG
--                                                /* if not SIG, have signature */
-+                /* if not SIG, have signature */
-                 )))))
--                   /* skip four more bytes to account for signature */
--                   shy += 4 - readbuf((char *)buf, 4);
-+          /* skip four more bytes to account for signature */
-+          shy += 4 - readbuf((char *)buf, 4);
-         if (G.pInfo->zip64)
--            shy += 8 - readbuf((char *)buf, 8); /* skip eight more for ZIP64 */
-+          shy += 8 - readbuf((char *)buf, 8); /* skip eight more for ZIP64 */
-         if (shy)
--            error = PK_ERR;
-+          error = PK_ERR;
+         // Skip over the data descriptor. We need to correctly position the
+         // read pointer after the data descriptor for the proper detection of
+         // overlapped zip file components.
+@@ -2303,8 +2310,8 @@ static int extract_or_test_member(__G)    /* return PK-type error code */
+             G.incnt += back;
+             G.inptr -= back;
+         }
 +      }
      }
 -