// unzstd -- .warc.zst decompressor // Copyright (C) 2021-2024 Sergey Matveev // // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, version 3 of the License. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program. If not, see . // https://iipc.github.io/warc-specifications/specifications/warc-zstd/ #include #include #include #include #include #include #include #ifdef __FreeBSD__ #include "capsicum.c.in" #include #include #include #endif // __FreeBSD__ static uint32_t le32dec(const char buf[4]) { return (uint32_t)(buf[3]) << 24 | (uint32_t)(buf[2]) << 16 | (uint32_t)(buf[1]) << 8 | (uint32_t)(buf[0]); } int main(int argc, char **argv) { FILE *fdOff = fdopen(3, "wb"); #ifdef __FreeBSD__ if ((fdOff != NULL) && (caph_limit_stream(3, CAPH_WRITE)) != 0) { errx(EX_OSERR, "can not caph_limit_stream(3)"); } capsicum_start(); #endif // __FreeBSD__ ZSTD_DCtx *ctx = ZSTD_createDCtx(); if (ctx == NULL) { fputs("can not initialize ZSTD_DCtx\n", stderr); return 1; } int rc = EXIT_FAILURE; char *bufIn = NULL; char *bufOut = NULL; const size_t bufInSize = ZSTD_DStreamInSize(); bufIn = malloc(bufInSize); if (bufIn == NULL) { fputs("no memory\n", stderr); goto Exit; } const size_t bufOutSize = ZSTD_DStreamOutSize(); bufOut = malloc(bufOutSize); if (bufOut == NULL) { fputs("no memory\n", stderr); goto Exit; } unsigned long long bufSize = 0; ZSTD_inBuffer bIn = {bufIn, 0, 0}; ZSTD_outBuffer bOut = {bufOut, 0, 0}; bool isEmpty = true; bool lastBlock = false; size_t n = 0; size_t written = 0; size_t offset = 0; size_t offsetPrev = 0; size_t zCode = 0; ReadAgain: for (;;) { n = fread(bufIn, 1, bufInSize, stdin); if (n != bufInSize) { if (feof(stdin)) { lastBlock = true; } else { perror("can not fread(FILE)"); goto Exit; } } if (n >= 8 && le32dec(bufIn) == 0x184D2A5D) { // dictionary frame size_t dictSize = (size_t)le32dec(bufIn + 4); char *dict = malloc(dictSize); if (dict == NULL) { fprintf(stderr, "insufficient memory for dictionary: %zu\n", dictSize); goto Exit; } const size_t alreadyRead = n - 8; memcpy(dict, bufIn + 8, alreadyRead); errno = 0; n = fread(dict + alreadyRead, 1, dictSize - alreadyRead, stdin); if (n != dictSize - alreadyRead) { perror("can not read dictionary data"); free(dict); goto Exit; } offset = dictSize + 8; offsetPrev = offset; if (fdOff != NULL) { fprintf(fdOff, "%zu\t0\n", offset); } uint32_t hdr = le32dec(dict); switch (hdr) { case ZSTD_MAGIC_DICTIONARY: zCode = ZSTD_DCtx_loadDictionary(ctx, dict, dictSize); free(dict); if ((zCode != 0) && (ZSTD_isError(zCode))) { fprintf( stderr, "can not load dictionary: %s\n", ZSTD_getErrorName(zCode)); goto Exit; } goto ReadAgain; case ZSTD_MAGICNUMBER: bufSize = ZSTD_getFrameContentSize(dict, dictSize); switch (bufSize) { case ZSTD_CONTENTSIZE_UNKNOWN: case ZSTD_CONTENTSIZE_ERROR: fprintf(stderr, "can not determine dictionary's size\n"); free(dict); goto Exit; } char *buf = malloc(bufSize); if (buf == NULL) { fprintf( stderr, "insufficient memory for dictionary: %llu\n", bufSize); free(dict); goto Exit; } zCode = ZSTD_decompress(buf, bufSize, dict, dictSize); free(dict); if (ZSTD_isError(zCode)) { fprintf( stderr, "can not decompress dictionary: %s\n", ZSTD_getErrorName(zCode)); free(buf); goto Exit; } zCode = ZSTD_DCtx_loadDictionary(ctx, buf, zCode); free(buf); if ((zCode != 0) && (ZSTD_isError(zCode))) { fprintf( stderr, "can not load dictionary: %s\n", ZSTD_getErrorName(zCode)); goto Exit; } goto ReadAgain; default: fprintf(stderr, "unknown dictionary header\n"); free(dict); goto Exit; } } isEmpty = false; bIn.size = n; bIn.pos = 0; while (bIn.pos < bIn.size) { bOut.size = bufOutSize; bOut.pos = 0; zCode = ZSTD_decompressStream(ctx, &bOut, &bIn); if ((zCode != 0) && (ZSTD_isError(zCode))) { fprintf(stderr, "can not decompress: %s\n", ZSTD_getErrorName(zCode)); goto Exit; } n = fwrite(bufOut, 1, bOut.pos, stdout); if (n != bOut.pos) { perror("can not fwrite(stdout)"); goto Exit; } written += n; if (zCode == 0) { offset += bIn.pos; if (fdOff != NULL) { fprintf(fdOff, "%zu\t%zu\n", offset - offsetPrev, written); } offsetPrev = offset + bIn.pos; written = 0; } } if (lastBlock) { break; } offset += bIn.pos; } if (isEmpty) { fputs("empty input\n", stderr); goto Exit; } if (zCode != 0) { fprintf(stderr, "unfinished decompression: %s\n", ZSTD_getErrorName(zCode)); goto Exit; } rc = EXIT_SUCCESS; Exit: if (bufOut != NULL) { free(bufOut); } if (bufIn != NULL) { free(bufIn); } ZSTD_freeDCtx(ctx); if ((fdOff != NULL) && (fclose(fdOff) != 0)) { perror("can not fclose(4)"); return EXIT_FAILURE; } return rc; }