Sergey Matveev's repositories - tofuproxy.git/blobdiff - cmd/zstd/unzstd.c
Refactor C-code building, use pkgconf
[tofuproxy.git] / cmd / zstd / unzstd.c
diff --git a/cmd/zstd/unzstd.c b/cmd/zstd/unzstd.c
new file mode 100644 (file)
index 0000000..ada11ea
--- /dev/null
@@ -0,0 +1,212 @@
+/*
+unzstd -- .warc.zst decompressor
+Copyright (C) 2021 Sergey Matveev <stargrave@stargrave.org>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, version 3 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+// https://iipc.github.io/warc-specifications/specifications/warc-zstd/
+
+#include <errno.h>
+#include <inttypes.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/endian.h>
+
+#include <zstd.h>
+
+// Magic number of the skippable frame that carries the dictionary.
+static const uint32_t dictFrameMagic = 0x184D2A5D;
+
+// Load the dictionary found in the dictionary frame into ctx.
+//
+// payload/payloadSize is the frame's content (payloadSize must be at
+// least 4): either a dictionary starting with ZSTD_MAGIC_DICTIONARY, or a
+// Zstandard frame (ZSTD_MAGICNUMBER) that decompresses to one.
+// Takes ownership of payload and frees it on every path.
+// Returns true on success; on failure a diagnostic has already been
+// written to stderr.
+static bool
+loadDictionary(ZSTD_DCtx *ctx, uint8_t *payload, const size_t payloadSize)
+{
+    size_t zCode = 0;
+    switch (le32dec(payload)) {
+    case ZSTD_MAGIC_DICTIONARY:
+        zCode = ZSTD_DCtx_loadDictionary(ctx, payload, payloadSize);
+        free(payload);
+        break;
+    case ZSTD_MAGICNUMBER: {
+        const unsigned long long rawSize =
+            ZSTD_getFrameContentSize(payload, payloadSize);
+        if (rawSize == ZSTD_CONTENTSIZE_UNKNOWN || rawSize == ZSTD_CONTENTSIZE_ERROR) {
+            fprintf(stderr, "can not determine dictionary's size\n");
+            free(payload);
+            return false;
+        }
+        uint8_t *raw = malloc(rawSize);
+        if (raw == NULL) {
+            fprintf(stderr, "insufficient memory for dictionary: %llu\n", rawSize);
+            free(payload);
+            return false;
+        }
+        zCode = ZSTD_decompress(raw, rawSize, payload, payloadSize);
+        free(payload);
+        if (ZSTD_isError(zCode)) {
+            fprintf(
+                stderr,
+                "can not decompress dictionary: %s\n",
+                ZSTD_getErrorName(zCode));
+            free(raw);
+            return false;
+        }
+        // On success zCode is the number of decompressed bytes.
+        zCode = ZSTD_DCtx_loadDictionary(ctx, raw, zCode);
+        free(raw);
+        break;
+    }
+    default:
+        fprintf(stderr, "unknown dictionary header\n");
+        free(payload);
+        return false;
+    }
+    if (ZSTD_isError(zCode)) {
+        fprintf(stderr, "can not load dictionary: %s\n", ZSTD_getErrorName(zCode));
+        return false;
+    }
+    return true;
+}
+
+// Decompress a .warc.zst stream from stdin to stdout.
+//
+// If file descriptor 3 is open, one line per finished frame is written to
+// it: "<compressed size>\t<decompressed size>\n". An optional leading
+// dictionary frame is reported as "<frame size>\t0\n" and its dictionary is
+// loaded into the decompression context.
+//
+// Returns EXIT_SUCCESS when the whole input was decompressed, a non-zero
+// status otherwise (diagnostics go to stderr).
+int
+main(int argc, char **argv)
+{
+    (void)argc;
+    (void)argv;
+
+    ZSTD_DCtx *ctx = ZSTD_createDCtx();
+    if (ctx == NULL) {
+        fputs("can not initialize ZSTD_DCtx\n", stderr);
+        return 1;
+    }
+    // Offsets are optional: fdopen() fails and fdOff stays NULL when
+    // descriptor 3 is not open.
+    FILE *fdOff            = fdopen(3, "wb");
+    int rc                 = EXIT_FAILURE;
+    uint8_t *bufIn         = NULL;
+    uint8_t *bufOut        = NULL;
+    const size_t bufInSize = ZSTD_DStreamInSize();
+    bufIn                  = malloc(bufInSize);
+    if (bufIn == NULL) {
+        fputs("no memory\n", stderr);
+        goto Exit;
+    }
+    const size_t bufOutSize = ZSTD_DStreamOutSize();
+    bufOut                  = malloc(bufOutSize);
+    if (bufOut == NULL) {
+        fputs("no memory\n", stderr);
+        goto Exit;
+    }
+
+    ZSTD_inBuffer bIn   = {bufIn, 0, 0};
+    ZSTD_outBuffer bOut = {bufOut, 0, 0};
+
+    bool atStart      = true;
+    bool isEmpty      = true;
+    bool lastBlock    = false;
+    size_t n          = 0;
+    size_t written    = 0;
+    size_t offset     = 0;
+    size_t offsetPrev = 0;
+    size_t zCode      = 0;
+    for (;;) {
+        n = fread(bufIn, 1, bufInSize, stdin);
+        if (n != bufInSize) {
+            if (!feof(stdin)) {
+                perror("can not fread(FILE)");
+                goto Exit;
+            }
+            lastBlock = true;
+        }
+
+        // The dictionary frame is only looked for at the very beginning of
+        // the input; later buffers start at arbitrary positions of the
+        // compressed stream and must not be matched against the magic.
+        const bool hasDict = atStart && n >= 8 && le32dec(bufIn) == dictFrameMagic;
+        atStart            = false;
+        if (hasDict) {
+            const size_t dictSize = (size_t)le32dec(bufIn + 4);
+            if (dictSize < 4) {
+                // Too short to even hold the magic inspected below.
+                fprintf(stderr, "unknown dictionary header\n");
+                goto Exit;
+            }
+            uint8_t *dict = malloc(dictSize);
+            if (dict == NULL) {
+                fprintf(stderr, "insufficient memory for dictionary: %zu\n", dictSize);
+                goto Exit;
+            }
+            // Part (or all) of the dictionary is already in bufIn. Never
+            // copy more than dictSize bytes: the buffer may also hold
+            // data following the dictionary frame.
+            const size_t buffered = n - 8;
+            const size_t have     = (buffered < dictSize) ? buffered : dictSize;
+            memcpy(dict, bufIn + 8, have);
+            if (have < dictSize) {
+                errno = 0;
+                n     = fread(dict + have, 1, dictSize - have, stdin);
+                if (n != dictSize - have) {
+                    perror("can not read dictionary data");
+                    free(dict);
+                    goto Exit;
+                }
+            }
+            offset     = dictSize + 8;
+            offsetPrev = offset;
+            if (fdOff != NULL) {
+                fprintf(fdOff, "%zu\t0\n", offset);
+            }
+            if (!loadDictionary(ctx, dict, dictSize)) {
+                goto Exit;
+            }
+            const size_t leftover = buffered - have;
+            if (leftover == 0) {
+                // Whole buffer consumed by the dictionary: read fresh data.
+                continue;
+            }
+            // Bytes after the dictionary frame are compressed data: move
+            // them to the front and decompress them as a (short) buffer.
+            memmove(bufIn, bufIn + 8 + dictSize, leftover);
+            n = leftover;
+        }
+
+        if (n > 0) {
+            isEmpty = false;
+        }
+        bIn.size = n;
+        bIn.pos  = 0;
+        while (bIn.pos < bIn.size) {
+            bOut.size = bufOutSize;
+            bOut.pos  = 0;
+            zCode     = ZSTD_decompressStream(ctx, &bOut, &bIn);
+            if (ZSTD_isError(zCode)) {
+                fprintf(stderr, "can not decompress: %s\n", ZSTD_getErrorName(zCode));
+                goto Exit;
+            }
+            n = fwrite(bufOut, 1, bOut.pos, stdout);
+            if (n != bOut.pos) {
+                perror("can not fwrite(stdout)");
+                goto Exit;
+            }
+            written += n;
+            if (zCode == 0) {
+                // A frame has been completely decoded. offset/offsetPrev
+                // are not plain file positions: both are advanced by
+                // bIn.pos here so that their difference at the next frame
+                // end is that frame's compressed size.
+                offset += bIn.pos;
+                if (fdOff != NULL) {
+                    fprintf(fdOff, "%zu\t%zu\n", offset - offsetPrev, written);
+                }
+                offsetPrev = offset + bIn.pos;
+                written    = 0;
+            }
+        }
+        if (lastBlock) {
+            break;
+        }
+        offset += bIn.pos;
+    }
+
+    if (isEmpty) {
+        fputs("empty input\n", stderr);
+        goto Exit;
+    }
+    if (zCode != 0) {
+        // Input ended in the middle of a frame.
+        fprintf(stderr, "unfinished decompression: %s\n", ZSTD_getErrorName(zCode));
+        goto Exit;
+    }
+    // stdout is buffered: make sure late write errors are not lost.
+    if (fflush(stdout) != 0) {
+        perror("can not fflush(stdout)");
+        goto Exit;
+    }
+    rc = EXIT_SUCCESS;
+
+Exit:
+    free(bufOut);
+    free(bufIn);
+    ZSTD_freeDCtx(ctx);
+    if ((fdOff != NULL) && (fclose(fdOff) != 0)) {
+        perror("can not fclose(3)");
+        return EXIT_FAILURE;
+    }
+    return rc;
+}