--- /dev/null
+/*
+unzstd -- .warc.zst decompressor
+Copyright (C) 2021 Sergey Matveev <stargrave@stargrave.org>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, version 3 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+// https://iipc.github.io/warc-specifications/specifications/warc-zstd/
+
+#include <errno.h>
+#include <inttypes.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/endian.h>
+
+#include <zstd.h>
+
+/*
+ * Load the payload of a dictionary frame into ctx.
+ *
+ * dict/dictSize is the frame payload; the caller guarantees dictSize >= 4.
+ * The payload is either a raw zstd dictionary (ZSTD_MAGIC_DICTIONARY) or a
+ * zstd-compressed one (ZSTD_MAGICNUMBER), which is decompressed first.
+ *
+ * Takes ownership of dict: it is freed on every path.
+ * Returns true on success. On failure a diagnostic is printed to stderr
+ * and false is returned.
+ */
+static bool
+loadDictPayload(ZSTD_DCtx *ctx, uint8_t *dict, const size_t dictSize)
+{
+    size_t zCode = 0;
+    switch (le32dec(dict)) {
+    case ZSTD_MAGIC_DICTIONARY:
+        zCode = ZSTD_DCtx_loadDictionary(ctx, dict, dictSize);
+        free(dict);
+        if (ZSTD_isError(zCode)) {
+            fprintf(
+                stderr,
+                "can not load dictionary: %s\n",
+                ZSTD_getErrorName(zCode));
+            return false;
+        };
+        return true;
+    case ZSTD_MAGICNUMBER: {
+        const unsigned long long bufSize =
+            ZSTD_getFrameContentSize(dict, dictSize);
+        if ((bufSize == ZSTD_CONTENTSIZE_UNKNOWN) ||
+            (bufSize == ZSTD_CONTENTSIZE_ERROR)) {
+            fprintf(stderr, "can not determine dictionary's size\n");
+            free(dict);
+            return false;
+        };
+        // Do not let the size silently truncate where size_t is narrower
+        // than unsigned long long.
+        uint8_t *buf = (bufSize > SIZE_MAX) ? NULL : malloc((size_t)bufSize);
+        if (buf == NULL) {
+            fprintf(
+                stderr, "insufficient memory for dictionary: %llu\n", bufSize);
+            free(dict);
+            return false;
+        };
+        zCode = ZSTD_decompress(buf, (size_t)bufSize, dict, dictSize);
+        free(dict);
+        if (ZSTD_isError(zCode)) {
+            fprintf(
+                stderr,
+                "can not decompress dictionary: %s\n",
+                ZSTD_getErrorName(zCode));
+            free(buf);
+            return false;
+        };
+        // zCode now holds the number of decompressed bytes.
+        zCode = ZSTD_DCtx_loadDictionary(ctx, buf, zCode);
+        free(buf);
+        if (ZSTD_isError(zCode)) {
+            fprintf(
+                stderr,
+                "can not load dictionary: %s\n",
+                ZSTD_getErrorName(zCode));
+            return false;
+        };
+        return true;
+    }
+    default:
+        fprintf(stderr, "unknown dictionary header\n");
+        free(dict);
+        return false;
+    };
+}
+
+/*
+ * Decompress a zstd stream from stdin to stdout.
+ *
+ * While no data has been decompressed yet, a leading frame with magic
+ * 0x184D2A5D (4-byte LE magic, 4-byte LE payload size, payload) is taken as
+ * a dictionary frame and its payload is loaded into the decompression
+ * context.
+ *
+ * If file descriptor 3 can be opened for writing, an index is written to
+ * it: one "<dictSize+8>\t0" line per dictionary frame, then one
+ * "<input bytes>\t<output bytes>" line each time ZSTD_decompressStream()
+ * reports a completed frame.
+ *
+ * Returns EXIT_SUCCESS on success. Any failure prints a diagnostic to
+ * stderr and yields a non-zero status.
+ */
+int
+main(int argc, char **argv)
+{
+    (void)argc;
+    (void)argv;
+
+    ZSTD_DCtx *ctx = ZSTD_createDCtx();
+    if (ctx == NULL) {
+        fputs("can not initialize ZSTD_DCtx\n", stderr);
+        return 1;
+    };
+    // NULL when fd 3 is unusable; every use below is guarded.
+    FILE *fdOff = fdopen(3, "wb");
+    int rc = EXIT_FAILURE;
+    uint8_t *bufIn = NULL;
+    uint8_t *bufOut = NULL;
+    const size_t bufInSize = ZSTD_DStreamInSize();
+    bufIn = malloc(bufInSize);
+    if (bufIn == NULL) {
+        fputs("no memory\n", stderr);
+        goto Exit;
+    };
+    const size_t bufOutSize = ZSTD_DStreamOutSize();
+    bufOut = malloc(bufOutSize);
+    if (bufOut == NULL) {
+        fputs("no memory\n", stderr);
+        goto Exit;
+    };
+
+    ZSTD_inBuffer bIn = {bufIn, 0, 0};
+    ZSTD_outBuffer bOut = {bufOut, 0, 0};
+
+    bool isEmpty = true;    // no input byte has been read at all
+    bool atStart = true;    // no data buffer has been decompressed yet
+    bool lastBlock = false; // stdin reached EOF
+    size_t pending = 0;     // bytes already in bufIn, carried over
+    size_t n = 0;
+    size_t written = 0;
+    // NOTE: offset and offsetPrev are not absolute stream positions. Both
+    // accumulate bIn.pos in a way that makes only their difference
+    // meaningful: it is the number of input bytes of the frame just
+    // finished. Keep the statement order below intact.
+    size_t offset = 0;
+    size_t offsetPrev = 0;
+    size_t zCode = 0;
+    for (;;) {
+        // Start with whatever a preceding dictionary frame left behind
+        // in bufIn, then top the buffer up from stdin.
+        n = pending;
+        pending = 0;
+        n += fread(bufIn + n, 1, bufInSize - n, stdin);
+        if (n != bufInSize) {
+            if (feof(stdin)) {
+                lastBlock = true;
+            } else {
+                perror("can not fread(FILE)");
+                goto Exit;
+            };
+        };
+        if (n > 0) {
+            isEmpty = false;
+        };
+        // Only look for the dictionary magic before any data has been
+        // decompressed. A later buffer may begin in the middle of
+        // compressed data that merely happens to match it.
+        if (atStart && n >= 8 && le32dec(bufIn) == 0x184D2A5D) {
+            // dictionary frame
+            const size_t dictSize = (size_t)le32dec(bufIn + 4);
+            if (dictSize < 4) {
+                // Too short to even hold the payload's own magic.
+                fprintf(stderr, "unknown dictionary header\n");
+                goto Exit;
+            };
+            uint8_t *dict = malloc(dictSize);
+            if (dict == NULL) {
+                fprintf(
+                    stderr,
+                    "insufficient memory for dictionary: %zu\n",
+                    dictSize);
+                goto Exit;
+            };
+            // The buffer may hold less than, exactly, or more than the
+            // whole payload. Never copy more than dictSize bytes.
+            const size_t avail = n - 8;
+            const size_t alreadyRead = (avail < dictSize) ? avail : dictSize;
+            memcpy(dict, bufIn + 8, alreadyRead);
+            if (alreadyRead < dictSize) {
+                const size_t want = dictSize - alreadyRead;
+                if (fread(dict + alreadyRead, 1, want, stdin) != want) {
+                    if (ferror(stdin)) {
+                        perror("can not read dictionary data");
+                    } else {
+                        fputs(
+                            "can not read dictionary data: unexpected EOF\n",
+                            stderr);
+                    };
+                    free(dict);
+                    goto Exit;
+                };
+            };
+            // Bytes following the dictionary frame belong to the stream:
+            // move them to the front so the next iteration consumes them.
+            pending = avail - alreadyRead;
+            if (pending > 0) {
+                memmove(bufIn, bufIn + 8 + dictSize, pending);
+            };
+            offset = dictSize + 8;
+            offsetPrev = offset;
+            if (fdOff != NULL) {
+                fprintf(fdOff, "%zu\t0\n", offset);
+            };
+            if (!loadDictPayload(ctx, dict, dictSize)) {
+                goto Exit;
+            };
+            continue;
+        };
+        atStart = false;
+        bIn.size = n;
+        bIn.pos = 0;
+        while (bIn.pos < bIn.size) {
+            bOut.size = bufOutSize;
+            bOut.pos = 0;
+            zCode = ZSTD_decompressStream(ctx, &bOut, &bIn);
+            if (ZSTD_isError(zCode)) {
+                fprintf(
+                    stderr,
+                    "can not decompress: %s\n",
+                    ZSTD_getErrorName(zCode));
+                goto Exit;
+            };
+            n = fwrite(bufOut, 1, bOut.pos, stdout);
+            if (n != bOut.pos) {
+                perror("can not fwrite(stdout)");
+                goto Exit;
+            };
+            written += n;
+            if (zCode == 0) {
+                // A frame has been completely decoded and flushed.
+                offset += bIn.pos;
+                if (fdOff != NULL) {
+                    fprintf(fdOff, "%zu\t%zu\n", offset - offsetPrev, written);
+                };
+                offsetPrev = offset + bIn.pos;
+                written = 0;
+            };
+        };
+        if (lastBlock) {
+            break;
+        };
+        offset += bIn.pos;
+    };
+
+    if (isEmpty) {
+        fputs("empty input\n", stderr);
+        goto Exit;
+    };
+    if (zCode != 0) {
+        // The last ZSTD_decompressStream() call did not end on a frame
+        // boundary.
+        fprintf(
+            stderr, "unfinished decompression: %s\n", ZSTD_getErrorName(zCode));
+        goto Exit;
+    };
+    // stdout is buffered: a write error may only surface on flush.
+    if (fflush(stdout) != 0) {
+        perror("can not fflush(stdout)");
+        goto Exit;
+    };
+    rc = EXIT_SUCCESS;
+
+Exit:
+    free(bufOut);
+    free(bufIn);
+    ZSTD_freeDCtx(ctx);
+    if ((fdOff != NULL) && (fclose(fdOff) != 0)) {
+        perror("can not fclose(3)");
+        return EXIT_FAILURE;
+    };
+    return rc;
+}