1 /*
2 * decompress.c: decompression abstraction layer
3 *
4 * Copyright (C) 2007, 2008 Colin Watson.
5 *
6 * This file is part of man-db.
7 *
8 * man-db is free software; you can redistribute it and/or modify it
9 * under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * man-db is distributed in the hope that it will be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with man-db; if not, write to the Free Software Foundation,
20 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 */
22
23 #ifdef HAVE_CONFIG_H
24 # include "config.h"
25 #endif /* HAVE_CONFIG_H */
26
27 #include <assert.h>
28 #include <string.h>
29 #include <stdbool.h>
30 #include <stdlib.h>
31 #include <stdio.h>
32 #include <sys/types.h>
33 #include <sys/stat.h>
34 #include <unistd.h>
35 #include <fcntl.h>
36
37 #ifdef HAVE_LIBZ
38 # include "zlib.h"
39 #endif /* HAVE_LIBZ */
40
41 #include "pipeline.h"
42
43 #include "attribute.h"
44 #include "minmax.h"
45 #include "xalloc.h"
46 #include "xstrndup.h"
47 #include "xvasprintf.h"
48
49 #include "manconfig.h"
50
51 #include "compression.h"
52 #include "sandbox.h"
53
54 #include "decompress.h"
55
/* Discriminator for the union inside struct decompress: records which of
 * the two backing representations a decompressor currently uses.
 */
enum decompress_tag {
	/* Data is obtained through a pipeline object (u.p). */
	DECOMPRESS_PIPELINE,
	/* Data is held in an in-memory buffer (u.inprocess). */
	DECOMPRESS_INPROCESS
};
60
/* State of a decompressor whose whole output is already in memory. */
struct decompress_inprocess {
	/* Buffer holding the data; released with free() when the
	 * decompressor is freed or the buffer is replaced.
	 */
	char *buf;
	/* Number of bytes in buf. */
	size_t len;
	/* Read position: number of bytes of buf already consumed. */
	size_t offset;
	/* Copy of the line most recently handed out by the line-oriented
	 * readers, or NULL if there is none.
	 */
	char *line_cache;
};
67
68 struct decompress {
69 enum decompress_tag tag;
70 union {
71 pipeline *p;
72 struct decompress_inprocess inprocess;
73 } u;
74 };
75
76 /* Create a new pipeline-based decompressor. Takes ownership of p. */
77 static decompress *decompress_new_pipeline (pipeline *p)
78 {
79 decompress *d = XMALLOC (decompress);
80
81 d->tag = DECOMPRESS_PIPELINE;
82 d->u.p = p;
83
84 return d;
85 }
86
87 #ifdef HAVE_LIBZ
88
89 /* Create a new in-process decompressor. Takes ownership of buf. */
90 static decompress *decompress_new_inprocess (char *buf, size_t len)
91 {
92 decompress *d = XMALLOC (decompress);
93
94 d->tag = DECOMPRESS_INPROCESS;
95 d->u.inprocess.buf = buf;
96 d->u.inprocess.len = len;
97 d->u.inprocess.offset = 0;
98 d->u.inprocess.line_cache = NULL;
99
100 return d;
101 }
102
103 static void decompress_zlib (void *data MAYBE_UNUSED)
104 {
105 gzFile zlibfile;
106 int fd;
107
108 fd = dup (STDIN_FILENO);
109 if (fd < 0)
110 return;
111
112 zlibfile = gzdopen (fd, "r");
113 if (!zlibfile) {
114 close (fd);
115 return;
116 }
117
118 for (;;) {
119 char buffer[4096];
120 int r = gzread (zlibfile, buffer, 4096);
121 if (r <= 0)
122 break;
123 if (fwrite (buffer, 1, (size_t) r, stdout) < (size_t) r)
124 break;
125 }
126
127 gzclose (zlibfile);
128 return;
129 }
130
131 /* The largest number of uncompressed bytes we're prepared to read into
132 * memory. (We actually allow at most one fewer byte than this, for easy
133 * EOF detection.)
134 *
135 * At the time of writing, 11 out of 27959 (0.04%) installed manual pages on
136 * the author's system were larger than this.
137 *
138 * We could lift this restriction if we streamed in-process decompression
139 * instead, but that's a bit complicated: we'd also need to stream encoding
140 * conversion, and there's relatively little point until lexgrog can rely on
141 * preprocessor header lines rather than having to scan the whole file for
142 * preprocessor indications. For the time being, one-shot buffering is
143 * cheap enough and much simpler.
144 */
145 #define MAX_INPROCESS 1048576
146
147 static decompress *decompress_try_zlib (const char *filename)
148 {
149 gzFile zlibfile;
150 /* We only ever call this from the parent process (and don't
151 * currently use threads), and this lets us skip per-file memory
152 * allocation.
153 */
154 static char buffer[MAX_INPROCESS];
155 int len = 0;
156
157 zlibfile = gzopen (filename, "r");
158 if (!zlibfile)
159 return NULL;
160
161 while (len < MAX_INPROCESS) {
162 /* Read one more byte than we're prepared to return, in
163 * order to detect EOF at the right position. The "len >=
164 * MAX_INPROCESS" check below catches the boundary case.
165 */
166 int r = gzread (zlibfile, buffer + len, MAX_INPROCESS - len);
167 if (r < 0) {
168 gzclose (zlibfile);
169 return NULL;
170 } else if (r == 0)
171 break;
172 else
173 len += r;
174 }
175
176 gzclose (zlibfile);
177 if (len >= MAX_INPROCESS)
178 return NULL;
179 /* Copy input data so that we don't have potential data corruption
180 * if more than one in-process decompressor is active at once. (An
181 * alternative might be to use a lock to prevent that situation.)
182 */
183 return decompress_new_inprocess (xmemdup (buffer, (size_t) len),
184 (size_t) len);
185 }
186
/* With zlib support, decompress_open reads its flags parameter, so the
 * parameter needs no unused-parameter annotation.
 */
#define OPEN_FLAGS_UNUSED
#else /* !HAVE_LIBZ */
/* Without zlib support, decompress_open never reads its flags parameter;
 * annotate it so that it does not draw an unused-parameter warning.
 */
#define OPEN_FLAGS_UNUSED MAYBE_UNUSED
#endif /* HAVE_LIBZ */

/* Sandbox handed to pipecmd_pre_exec for every command this file creates;
 * defined elsewhere.
 */
extern man_sandbox *sandbox;
193
194 decompress *decompress_open (const char *filename, int flags OPEN_FLAGS_UNUSED)
195 {
196 pipecmd *cmd;
197 pipeline *p;
198 struct stat st;
199 #ifdef HAVE_LIBZ
200 size_t filename_len;
201 #endif /* HAVE_LIBZ */
202 char *ext;
203 struct compression *comp;
204
205 if (stat (filename, &st) < 0 || S_ISDIR (st.st_mode))
206 return NULL;
207
208 #ifdef HAVE_LIBZ
209 filename_len = strlen (filename);
210 if (filename_len > 3 && STREQ (filename + filename_len - 3, ".gz")) {
211 if (flags & DECOMPRESS_ALLOW_INPROCESS) {
212 decompress *d = decompress_try_zlib (filename);
213 if (d)
214 return d;
215 }
216
217 cmd = pipecmd_new_function ("zcat", &decompress_zlib, NULL,
218 NULL);
219 pipecmd_pre_exec (cmd, sandbox_load, sandbox_free, sandbox);
220 p = pipeline_new_commands (cmd, (void *) 0);
221 goto got_pipeline;
222 }
223 #endif /* HAVE_LIBZ */
224
225 ext = strrchr (filename, '.');
226 if (ext) {
227 ++ext;
228
229 for (comp = comp_list; comp->ext; ++comp) {
230 if (!STREQ (comp->ext, ext))
231 continue;
232
233 cmd = pipecmd_new_argstr (comp->prog);
234 pipecmd_pre_exec (cmd, sandbox_load, sandbox_free,
235 sandbox);
236 p = pipeline_new_commands (cmd, (void *) 0);
237 goto got_pipeline;
238 }
239 }
240
241 #ifdef HAVE_GZIP
242 /* HP-UX */
243 ext = strstr (filename, ".Z/");
244 if (ext) {
245 cmd = pipecmd_new_argstr (PROG_GUNZIP);
246 pipecmd_pre_exec (cmd, sandbox_load, sandbox_free, sandbox);
247 p = pipeline_new_commands (cmd, (void *) 0);
248 goto got_pipeline;
249 }
250 #endif
251
252 p = pipeline_new ();
253
254 got_pipeline:
255 pipeline_want_infile (p, filename);
256 pipeline_want_out (p, -1);
257 return decompress_new_pipeline (p);
258 }
259
260 decompress *decompress_fdopen (int fd)
261 {
262 pipeline *p;
263 #ifdef HAVE_LIBZ
264 pipecmd *cmd;
265 #endif /* HAVE_LIBZ */
266
267 #ifdef HAVE_LIBZ
268 cmd = pipecmd_new_function ("zcat", &decompress_zlib, NULL, NULL);
269 pipecmd_pre_exec (cmd, sandbox_load, sandbox_free, sandbox);
270 p = pipeline_new_commands (cmd, (void *) 0);
271 #else /* HAVE_LIBZ */
272 p = pipeline_new ();
273 #endif /* HAVE_LIBZ */
274
275 pipeline_want_in (p, fd);
276 pipeline_want_out (p, -1);
277 return decompress_new_pipeline (p);
278 }
279
280 bool ATTRIBUTE_PURE decompress_is_pipeline (decompress *d)
281 {
282 return d->tag == DECOMPRESS_PIPELINE;
283 }
284
285 pipeline * ATTRIBUTE_PURE decompress_get_pipeline (decompress *d)
286 {
287 assert (d->tag == DECOMPRESS_PIPELINE);
288 return d->u.p;
289 }
290
291 const char * ATTRIBUTE_PURE decompress_inprocess_buf (decompress *d)
292 {
293 assert (d->tag == DECOMPRESS_INPROCESS);
294 return d->u.inprocess.buf;
295 }
296
297 size_t ATTRIBUTE_PURE decompress_inprocess_len (decompress *d)
298 {
299 assert (d->tag == DECOMPRESS_INPROCESS);
300 return d->u.inprocess.len;
301 }
302
303 void decompress_inprocess_replace (decompress *d, char *buf, size_t len)
304 {
305 assert (d->tag == DECOMPRESS_INPROCESS);
306
307 free (d->u.inprocess.line_cache);
308 free (d->u.inprocess.buf);
309
310 d->u.inprocess.buf = buf;
311 d->u.inprocess.len = len;
312 d->u.inprocess.offset = 0;
313 d->u.inprocess.line_cache = NULL;
314 }
315
316 void decompress_start (decompress *d)
317 {
318 if (d->tag == DECOMPRESS_PIPELINE)
319 pipeline_start (d->u.p);
320 }
321
322 const char *decompress_read (decompress *d, size_t *len)
323 {
324 if (d->tag == DECOMPRESS_PIPELINE)
325 return pipeline_read (d->u.p, len);
326 else {
327 const char *ret;
328 assert (d->tag == DECOMPRESS_INPROCESS);
329 *len = MIN (*len, d->u.inprocess.len - d->u.inprocess.offset);
330 ret = d->u.inprocess.buf + d->u.inprocess.offset;
331 d->u.inprocess.offset += *len;
332 return ret;
333 }
334 }
335
336 const char *decompress_peek (decompress *d, size_t *len)
337 {
338 if (d->tag == DECOMPRESS_PIPELINE)
339 return pipeline_peek (d->u.p, len);
340 else {
341 assert (d->tag == DECOMPRESS_INPROCESS);
342 *len = MIN (*len, d->u.inprocess.len - d->u.inprocess.offset);
343 return d->u.inprocess.buf + d->u.inprocess.offset;
344 }
345 }
346
347 void decompress_peek_skip (decompress *d, size_t len)
348 {
349 if (d->tag == DECOMPRESS_PIPELINE)
350 pipeline_peek_skip (d->u.p, len);
351 else {
352 assert (d->tag == DECOMPRESS_INPROCESS);
353 assert (len <= d->u.inprocess.len - d->u.inprocess.offset);
354 d->u.inprocess.offset += len;
355 }
356 }
357
358 const char *decompress_readline (decompress *d)
359 {
360 if (d->tag == DECOMPRESS_PIPELINE)
361 return pipeline_readline (d->u.p);
362 else {
363 const char *cur, *end;
364 assert (d->tag == DECOMPRESS_INPROCESS);
365 /* This isn't on the hot path (only called for a few lines
366 * at the start of the file), so we can afford to
367 * reallocate.
368 */
369 if (d->u.inprocess.line_cache) {
370 free (d->u.inprocess.line_cache);
371 d->u.inprocess.line_cache = NULL;
372 }
373 cur = d->u.inprocess.buf + d->u.inprocess.offset;
374 end = memchr (cur, '\n',
375 d->u.inprocess.len - d->u.inprocess.offset);
376 if (end) {
377 d->u.inprocess.line_cache = xstrndup
378 (cur, end - cur + 1);
379 d->u.inprocess.offset += end - cur + 1;
380 return d->u.inprocess.line_cache;
381 } else
382 return NULL;
383 }
384 }
385
386 const char *decompress_peekline (decompress *d)
387 {
388 if (d->tag == DECOMPRESS_PIPELINE)
389 return pipeline_peekline (d->u.p);
390 else {
391 const char *cur, *end;
392 assert (d->tag == DECOMPRESS_INPROCESS);
393 /* This isn't on the hot path (only called for a few lines
394 * at the start of the file), so we can afford to
395 * reallocate.
396 */
397 if (d->u.inprocess.line_cache) {
398 free (d->u.inprocess.line_cache);
399 d->u.inprocess.line_cache = NULL;
400 }
401 cur = d->u.inprocess.buf + d->u.inprocess.offset;
402 end = memchr (cur, '\n',
403 d->u.inprocess.len - d->u.inprocess.offset);
404 if (end) {
405 d->u.inprocess.line_cache = xstrndup
406 (cur, end - cur + 1);
407 return d->u.inprocess.line_cache;
408 } else
409 return NULL;
410 }
411 }
412
413 int decompress_wait (decompress *d)
414 {
415 if (d->tag == DECOMPRESS_PIPELINE)
416 return pipeline_wait (d->u.p);
417 else {
418 assert (d->tag == DECOMPRESS_INPROCESS);
419 return 0;
420 }
421 }
422
423 void decompress_free (decompress *d)
424 {
425 if (!d)
426 return;
427 if (d->tag == DECOMPRESS_PIPELINE)
428 pipeline_free (d->u.p);
429 else {
430 assert (d->tag == DECOMPRESS_INPROCESS);
431 free (d->u.inprocess.line_cache);
432 free (d->u.inprocess.buf);
433 }
434 free (d);
435 }