1 // unzstd -- .warc.zst decompressor
2 // Copyright (C) 2021-2024 Sergey Matveev <stargrave@stargrave.org>
4 // This program is free software: you can redistribute it and/or modify
5 // it under the terms of the GNU General Public License as published by
6 // the Free Software Foundation, version 3 of the License.
8 // This program is distributed in the hope that it will be useful,
9 // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 // GNU General Public License for more details.
13 // You should have received a copy of the GNU General Public License
14 // along with this program. If not, see <http://www.gnu.org/licenses/>.
16 // https://iipc.github.io/warc-specifications/specifications/warc-zstd/
28 #include "capsicum.c.in"
29 #include <capsicum_helpers.h>
// Decode a 32-bit little-endian unsigned integer from the first four
// bytes of buf; buf[0] is the least significant byte.
//
// The bytes are read as unsigned char before widening. Plain char may be
// signed, in which case a byte >= 0x80 would sign-extend to 0xFFFFFFxx
// and set every higher bit of the result once OR-ed in.
static uint32_t
le32dec(const char buf[4])
{
    const unsigned char *b = (const unsigned char *)buf;
    return (uint32_t)b[3] << 24 | (uint32_t)b[2] << 16 |
        (uint32_t)b[1] << 8 | (uint32_t)b[0];
}
42 main(int argc, char **argv)
44 FILE *fdOff = fdopen(3, "wb");
46 if ((fdOff != NULL) && (caph_limit_stream(3, CAPH_WRITE)) != 0) {
47 errx(EX_OSERR, "can not caph_limit_stream(3)");
52 ZSTD_DCtx *ctx = ZSTD_createDCtx();
54 fputs("can not initialize ZSTD_DCtx\n", stderr);
57 int rc = EXIT_FAILURE;
60 const size_t bufInSize = ZSTD_DStreamInSize();
61 bufIn = malloc(bufInSize);
63 fputs("no memory\n", stderr);
66 const size_t bufOutSize = ZSTD_DStreamOutSize();
67 bufOut = malloc(bufOutSize);
69 fputs("no memory\n", stderr);
73 unsigned long long bufSize = 0;
75 ZSTD_inBuffer bIn = {bufIn, 0, 0};
76 ZSTD_outBuffer bOut = {bufOut, 0, 0};
79 bool lastBlock = false;
83 size_t offsetPrev = 0;
87 n = fread(bufIn, 1, bufInSize, stdin);
92 perror("can not fread(FILE)");
96 if (n >= 8 && le32dec(bufIn) == 0x184D2A5D) {
98 size_t dictSize = (size_t)le32dec(bufIn + 4);
99 char *dict = malloc(dictSize);
101 fprintf(stderr, "insufficient memory for dictionary: %zu\n", dictSize);
104 const size_t alreadyRead = n - 8;
105 memcpy(dict, bufIn + 8, alreadyRead);
107 n = fread(dict + alreadyRead, 1, dictSize - alreadyRead, stdin);
108 if (n != dictSize - alreadyRead) {
109 perror("can not read dictionary data");
113 offset = dictSize + 8;
116 fprintf(fdOff, "%zu\t0\n", offset);
118 uint32_t hdr = le32dec(dict);
120 case ZSTD_MAGIC_DICTIONARY:
121 zCode = ZSTD_DCtx_loadDictionary(ctx, dict, dictSize);
123 if ((zCode != 0) && (ZSTD_isError(zCode))) {
126 "can not load dictionary: %s\n",
127 ZSTD_getErrorName(zCode));
131 case ZSTD_MAGICNUMBER:
132 bufSize = ZSTD_getFrameContentSize(dict, dictSize);
134 case ZSTD_CONTENTSIZE_UNKNOWN:
135 case ZSTD_CONTENTSIZE_ERROR:
136 fprintf(stderr, "can not determine dictionary's size\n");
140 char *buf = malloc(bufSize);
143 stderr, "insufficient memory for dictionary: %llu\n", bufSize);
147 zCode = ZSTD_decompress(buf, bufSize, dict, dictSize);
149 if (ZSTD_isError(zCode)) {
152 "can not decompress dictionary: %s\n",
153 ZSTD_getErrorName(zCode));
157 zCode = ZSTD_DCtx_loadDictionary(ctx, buf, zCode);
159 if ((zCode != 0) && (ZSTD_isError(zCode))) {
162 "can not load dictionary: %s\n",
163 ZSTD_getErrorName(zCode));
168 fprintf(stderr, "unknown dictionary header\n");
176 while (bIn.pos < bIn.size) {
177 bOut.size = bufOutSize;
179 zCode = ZSTD_decompressStream(ctx, &bOut, &bIn);
180 if ((zCode != 0) && (ZSTD_isError(zCode))) {
181 fprintf(stderr, "can not decompress: %s\n", ZSTD_getErrorName(zCode));
184 n = fwrite(bufOut, 1, bOut.pos, stdout);
186 perror("can not fwrite(stdout)");
193 fprintf(fdOff, "%zu\t%zu\n", offset - offsetPrev, written);
195 offsetPrev = offset + bIn.pos;
206 fputs("empty input\n", stderr);
210 fprintf(stderr, "unfinished decompression: %s\n", ZSTD_getErrorName(zCode));
216 if (bufOut != NULL) {
223 if ((fdOff != NULL) && (fclose(fdOff) != 0)) {
224 perror("can not fclose(4)");