1 ///////////////////////////////////////////////////////////////////////////////
2 //
3 /// \file coder.c
4 /// \brief Compresses or uncompresses a file
5 //
6 // Author: Lasse Collin
7 //
8 // This file has been put into the public domain.
9 // You can do whatever you want with this file.
10 //
11 ///////////////////////////////////////////////////////////////////////////////
12
13 #include "private.h"
14
15
/// Outcome of coder_init(). It tells the caller which coding path to use.
enum coder_init_ret {
	/// A normal liblzma-based coder was initialized.
	CODER_INIT_NORMAL,

	/// Passthru mode should be used: the input is copied as is.
	CODER_INIT_PASSTHRU,

	/// Initialization failed.
	CODER_INIT_ERROR,
};
22
23
24 enum operation_mode opt_mode = MODE_COMPRESS;
25 enum format_type opt_format = FORMAT_AUTO;
26 bool opt_auto_adjust = true;
27 bool opt_single_stream = false;
28 uint64_t opt_block_size = 0;
29 uint64_t *opt_block_list = NULL;
30
31
32 /// Stream used to communicate with liblzma
33 static lzma_stream strm = LZMA_STREAM_INIT;
34
35 /// Filters needed for all encoding all formats, and also decoding in raw data
36 static lzma_filter filters[LZMA_FILTERS_MAX + 1];
37
38 /// Input and output buffers
39 static io_buf in_buf;
40 static io_buf out_buf;
41
42 /// Number of filters. Zero indicates that we are using a preset.
43 static uint32_t filters_count = 0;
44
45 /// Number of the preset (0-9)
46 static uint32_t preset_number = LZMA_PRESET_DEFAULT;
47
48 /// Integrity check type
49 static lzma_check check;
50
51 /// This becomes false if the --check=CHECK option is used.
52 static bool check_default = true;
53
54 /// Indicates if unconsumed input is allowed to remain after
55 /// decoding has successfully finished. This is set for each file
56 /// in coder_init().
57 static bool allow_trailing_input;
58
59 #ifdef MYTHREAD_ENABLED
60 static lzma_mt mt_options = {
61 .flags = 0,
62 .timeout = 300,
63 .filters = filters,
64 };
65 #endif
66
67
68 extern void
69 coder_set_check(lzma_check new_check)
70 {
71 check = new_check;
72 check_default = false;
73 return;
74 }
75
76
77 static void
78 forget_filter_chain(void)
79 {
80 // Setting a preset makes us forget a possibly defined custom
81 // filter chain.
82 while (filters_count > 0) {
83 --filters_count;
84 free(filters[filters_count].options);
85 filters[filters_count].options = NULL;
86 }
87
88 return;
89 }
90
91
92 extern void
93 coder_set_preset(uint32_t new_preset)
94 {
95 preset_number &= ~LZMA_PRESET_LEVEL_MASK;
96 preset_number |= new_preset;
97 forget_filter_chain();
98 return;
99 }
100
101
102 extern void
103 coder_set_extreme(void)
104 {
105 preset_number |= LZMA_PRESET_EXTREME;
106 forget_filter_chain();
107 return;
108 }
109
110
111 extern void
112 coder_add_filter(lzma_vli id, void *options)
113 {
114 if (filters_count == LZMA_FILTERS_MAX)
115 message_fatal(_("Maximum number of filters is four"));
116
117 filters[filters_count].id = id;
118 filters[filters_count].options = options;
119 ++filters_count;
120
121 // Setting a custom filter chain makes us forget the preset options.
122 // This makes a difference if one specifies e.g. "xz -9 --lzma2 -e"
123 // where the custom filter chain resets the preset level back to
124 // the default 6, making the example equivalent to "xz -6e".
125 preset_number = LZMA_PRESET_DEFAULT;
126
127 return;
128 }
129
130
131 tuklib_attr_noreturn
132 static void
133 memlimit_too_small(uint64_t memory_usage)
134 {
135 message(V_ERROR, _("Memory usage limit is too low for the given "
136 "filter setup."));
137 message_mem_needed(V_ERROR, memory_usage);
138 tuklib_exit(E_ERROR, E_ERROR, false);
139 }
140
141
142 extern void
143 coder_set_compression_settings(void)
144 {
145 #ifdef HAVE_LZIP_DECODER
146 // .lz compression isn't supported.
147 assert(opt_format != FORMAT_LZIP);
148 #endif
149
150 // The default check type is CRC64, but fallback to CRC32
151 // if CRC64 isn't supported by the copy of liblzma we are
152 // using. CRC32 is always supported.
153 if (check_default) {
154 check = LZMA_CHECK_CRC64;
155 if (!lzma_check_is_supported(check))
156 check = LZMA_CHECK_CRC32;
157 }
158
159 // Options for LZMA1 or LZMA2 in case we are using a preset.
160 static lzma_options_lzma opt_lzma;
161
162 if (filters_count == 0) {
163 // We are using a preset. This is not a good idea in raw mode
164 // except when playing around with things. Different versions
165 // of this software may use different options in presets, and
166 // thus make uncompressing the raw data difficult.
167 if (opt_format == FORMAT_RAW) {
168 // The message is shown only if warnings are allowed
169 // but the exit status isn't changed.
170 message(V_WARNING, _("Using a preset in raw mode "
171 "is discouraged."));
172 message(V_WARNING, _("The exact options of the "
173 "presets may vary between software "
174 "versions."));
175 }
176
177 // Get the preset for LZMA1 or LZMA2.
178 if (lzma_lzma_preset(&opt_lzma, preset_number))
179 message_bug();
180
181 // Use LZMA2 except with --format=lzma we use LZMA1.
182 filters[0].id = opt_format == FORMAT_LZMA
183 ? LZMA_FILTER_LZMA1 : LZMA_FILTER_LZMA2;
184 filters[0].options = &opt_lzma;
185 filters_count = 1;
186 }
187
188 // Terminate the filter options array.
189 filters[filters_count].id = LZMA_VLI_UNKNOWN;
190
191 // If we are using the .lzma format, allow exactly one filter
192 // which has to be LZMA1.
193 if (opt_format == FORMAT_LZMA && (filters_count != 1
194 || filters[0].id != LZMA_FILTER_LZMA1))
195 message_fatal(_("The .lzma format supports only "
196 "the LZMA1 filter"));
197
198 // If we are using the .xz format, make sure that there is no LZMA1
199 // filter to prevent LZMA_PROG_ERROR.
200 if (opt_format == FORMAT_XZ)
201 for (size_t i = 0; i < filters_count; ++i)
202 if (filters[i].id == LZMA_FILTER_LZMA1)
203 message_fatal(_("LZMA1 cannot be used "
204 "with the .xz format"));
205
206 // Print the selected filter chain.
207 message_filters_show(V_DEBUG, filters);
208
209 // The --flush-timeout option requires LZMA_SYNC_FLUSH support
210 // from the filter chain. Currently threaded encoder doesn't support
211 // LZMA_SYNC_FLUSH so single-threaded mode must be used.
212 if (opt_mode == MODE_COMPRESS && opt_flush_timeout != 0) {
213 for (size_t i = 0; i < filters_count; ++i) {
214 switch (filters[i].id) {
215 case LZMA_FILTER_LZMA2:
216 case LZMA_FILTER_DELTA:
217 break;
218
219 default:
220 message_fatal(_("The filter chain is "
221 "incompatible with --flush-timeout"));
222 }
223 }
224
225 if (hardware_threads_is_mt()) {
226 message(V_WARNING, _("Switching to single-threaded "
227 "mode due to --flush-timeout"));
228 hardware_threads_set(1);
229 }
230 }
231
232 // Get the memory usage. Note that if --format=raw was used,
233 // we can be decompressing.
234 //
235 // If multithreaded .xz compression is done, this value will be
236 // replaced.
237 uint64_t memory_limit = hardware_memlimit_get(opt_mode);
238 uint64_t memory_usage = UINT64_MAX;
239 if (opt_mode == MODE_COMPRESS) {
240 #ifdef HAVE_ENCODERS
241 # ifdef MYTHREAD_ENABLED
242 if (opt_format == FORMAT_XZ && hardware_threads_is_mt()) {
243 memory_limit = hardware_memlimit_mtenc_get();
244 mt_options.threads = hardware_threads_get();
245 mt_options.block_size = opt_block_size;
246 mt_options.check = check;
247 memory_usage = lzma_stream_encoder_mt_memusage(
248 &mt_options);
249 if (memory_usage != UINT64_MAX)
250 message(V_DEBUG, _("Using up to %" PRIu32
251 " threads."),
252 mt_options.threads);
253 } else
254 # endif
255 {
256 memory_usage = lzma_raw_encoder_memusage(filters);
257 }
258 #endif
259 } else {
260 #ifdef HAVE_DECODERS
261 memory_usage = lzma_raw_decoder_memusage(filters);
262 #endif
263 }
264
265 if (memory_usage == UINT64_MAX)
266 message_fatal(_("Unsupported filter chain or filter options"));
267
268 // Print memory usage info before possible dictionary
269 // size auto-adjusting.
270 //
271 // NOTE: If only encoder support was built, we cannot show the
272 // what the decoder memory usage will be.
273 message_mem_needed(V_DEBUG, memory_usage);
274 #ifdef HAVE_DECODERS
275 if (opt_mode == MODE_COMPRESS) {
276 const uint64_t decmem = lzma_raw_decoder_memusage(filters);
277 if (decmem != UINT64_MAX)
278 message(V_DEBUG, _("Decompression will need "
279 "%s MiB of memory."), uint64_to_str(
280 round_up_to_mib(decmem), 0));
281 }
282 #endif
283
284 if (memory_usage <= memory_limit)
285 return;
286
287 // With --format=raw settings are never adjusted to meet
288 // the memory usage limit.
289 if (opt_format == FORMAT_RAW)
290 memlimit_too_small(memory_usage);
291
292 assert(opt_mode == MODE_COMPRESS);
293
294 #ifdef HAVE_ENCODERS
295 # ifdef MYTHREAD_ENABLED
296 if (opt_format == FORMAT_XZ && hardware_threads_is_mt()) {
297 // Try to reduce the number of threads before
298 // adjusting the compression settings down.
299 while (mt_options.threads > 1) {
300 // Reduce the number of threads by one and check
301 // the memory usage.
302 --mt_options.threads;
303 memory_usage = lzma_stream_encoder_mt_memusage(
304 &mt_options);
305 if (memory_usage == UINT64_MAX)
306 message_bug();
307
308 if (memory_usage <= memory_limit) {
309 // The memory usage is now low enough.
310 message(V_WARNING, _("Reduced the number of "
311 "threads from %s to %s to not exceed "
312 "the memory usage limit of %s MiB"),
313 uint64_to_str(
314 hardware_threads_get(), 0),
315 uint64_to_str(mt_options.threads, 1),
316 uint64_to_str(round_up_to_mib(
317 memory_limit), 2));
318 return;
319 }
320 }
321
322 // If the memory usage limit is only a soft limit (automatic
323 // number of threads and no --memlimit-compress), the limit
324 // is only used to reduce the number of threads and once at
325 // just one thread, the limit is completely ignored. This
326 // way -T0 won't use insane amount of memory but at the same
327 // time the soft limit will never make xz fail and never make
328 // xz change settings that would affect the compressed output.
329 if (hardware_memlimit_mtenc_is_default()) {
330 message(V_WARNING, _("Reduced the number of threads "
331 "from %s to one. The automatic memory usage "
332 "limit of %s MiB is still being exceeded. "
333 "%s MiB of memory is required. "
334 "Continuing anyway."),
335 uint64_to_str(hardware_threads_get(), 0),
336 uint64_to_str(
337 round_up_to_mib(memory_limit), 1),
338 uint64_to_str(
339 round_up_to_mib(memory_usage), 2));
340 return;
341 }
342
343 // If --no-adjust was used, we cannot drop to single-threaded
344 // mode since it produces different compressed output.
345 //
346 // NOTE: In xz 5.2.x, --no-adjust also prevented reducing
347 // the number of threads. This changed in 5.3.3alpha.
348 if (!opt_auto_adjust)
349 memlimit_too_small(memory_usage);
350
351 // Switch to single-threaded mode. It uses
352 // less memory than using one thread in
353 // the multithreaded mode but the output
354 // is also different.
355 hardware_threads_set(1);
356 memory_usage = lzma_raw_encoder_memusage(filters);
357 message(V_WARNING, _("Switching to single-threaded mode "
358 "to not exceed the memory usage limit of %s MiB"),
359 uint64_to_str(round_up_to_mib(memory_limit), 0));
360 }
361 # endif
362
363 if (memory_usage <= memory_limit)
364 return;
365
366 // Don't adjust LZMA2 or LZMA1 dictionary size if --no-adjust
367 // was specified as that would change the compressed output.
368 if (!opt_auto_adjust)
369 memlimit_too_small(memory_usage);
370
371 // Look for the last filter if it is LZMA2 or LZMA1, so we can make
372 // it use less RAM. With other filters we don't know what to do.
373 size_t i = 0;
374 while (filters[i].id != LZMA_FILTER_LZMA2
375 && filters[i].id != LZMA_FILTER_LZMA1) {
376 if (filters[i].id == LZMA_VLI_UNKNOWN)
377 memlimit_too_small(memory_usage);
378
379 ++i;
380 }
381
382 // Decrease the dictionary size until we meet the memory
383 // usage limit. First round down to full mebibytes.
384 lzma_options_lzma *opt = filters[i].options;
385 const uint32_t orig_dict_size = opt->dict_size;
386 opt->dict_size &= ~((UINT32_C(1) << 20) - 1);
387 while (true) {
388 // If it is below 1 MiB, auto-adjusting failed. We could be
389 // more sophisticated and scale it down even more, but let's
390 // see if many complain about this version.
391 //
392 // FIXME: Displays the scaled memory usage instead
393 // of the original.
394 if (opt->dict_size < (UINT32_C(1) << 20))
395 memlimit_too_small(memory_usage);
396
397 memory_usage = lzma_raw_encoder_memusage(filters);
398 if (memory_usage == UINT64_MAX)
399 message_bug();
400
401 // Accept it if it is low enough.
402 if (memory_usage <= memory_limit)
403 break;
404
405 // Otherwise 1 MiB down and try again. I hope this
406 // isn't too slow method for cases where the original
407 // dict_size is very big.
408 opt->dict_size -= UINT32_C(1) << 20;
409 }
410
411 // Tell the user that we decreased the dictionary size.
412 message(V_WARNING, _("Adjusted LZMA%c dictionary size "
413 "from %s MiB to %s MiB to not exceed "
414 "the memory usage limit of %s MiB"),
415 filters[i].id == LZMA_FILTER_LZMA2
416 ? '2' : '1',
417 uint64_to_str(orig_dict_size >> 20, 0),
418 uint64_to_str(opt->dict_size >> 20, 1),
419 uint64_to_str(round_up_to_mib(memory_limit), 2));
420 #endif
421
422 return;
423 }
424
425
426 #ifdef HAVE_DECODERS
427 /// Return true if the data in in_buf seems to be in the .xz format.
428 static bool
429 is_format_xz(void)
430 {
431 // Specify the magic as hex to be compatible with EBCDIC systems.
432 static const uint8_t magic[6] = { 0xFD, 0x37, 0x7A, 0x58, 0x5A, 0x00 };
433 return strm.avail_in >= sizeof(magic)
434 && memcmp(in_buf.u8, magic, sizeof(magic)) == 0;
435 }
436
437
438 /// Return true if the data in in_buf seems to be in the .lzma format.
439 static bool
440 is_format_lzma(void)
441 {
442 // The .lzma header is 13 bytes.
443 if (strm.avail_in < 13)
444 return false;
445
446 // Decode the LZMA1 properties.
447 lzma_filter filter = { .id = LZMA_FILTER_LZMA1 };
448 if (lzma_properties_decode(&filter, NULL, in_buf.u8, 5) != LZMA_OK)
449 return false;
450
451 // A hack to ditch tons of false positives: We allow only dictionary
452 // sizes that are 2^n or 2^n + 2^(n-1) or UINT32_MAX. LZMA_Alone
453 // created only files with 2^n, but accepts any dictionary size.
454 // If someone complains, this will be reconsidered.
455 lzma_options_lzma *opt = filter.options;
456 const uint32_t dict_size = opt->dict_size;
457 free(opt);
458
459 if (dict_size != UINT32_MAX) {
460 uint32_t d = dict_size - 1;
461 d |= d >> 2;
462 d |= d >> 3;
463 d |= d >> 4;
464 d |= d >> 8;
465 d |= d >> 16;
466 ++d;
467 if (d != dict_size || dict_size == 0)
468 return false;
469 }
470
471 // Another hack to ditch false positives: Assume that if the
472 // uncompressed size is known, it must be less than 256 GiB.
473 // Again, if someone complains, this will be reconsidered.
474 uint64_t uncompressed_size = 0;
475 for (size_t i = 0; i < 8; ++i)
476 uncompressed_size |= (uint64_t)(in_buf.u8[5 + i]) << (i * 8);
477
478 if (uncompressed_size != UINT64_MAX
479 && uncompressed_size > (UINT64_C(1) << 38))
480 return false;
481
482 return true;
483 }
484
485
#ifdef HAVE_LZIP_DECODER
/// Check if in_buf begins with the four magic bytes of the .lz format
/// (ASCII "LZIP"). Returns false if fewer than four bytes of input
/// are available.
static bool
is_format_lzip(void)
{
	static const uint8_t magic[4] = { 0x4C, 0x5A, 0x49, 0x50 };

	if (strm.avail_in < sizeof(magic))
		return false;

	return memcmp(in_buf.u8, magic, sizeof(magic)) == 0;
}
#endif
496 #endif
497
498
499 /// Detect the input file type (for now, this done only when decompressing),
500 /// and initialize an appropriate coder. Return value indicates if a normal
501 /// liblzma-based coder was initialized (CODER_INIT_NORMAL), if passthru
502 /// mode should be used (CODER_INIT_PASSTHRU), or if an error occurred
503 /// (CODER_INIT_ERROR).
504 static enum coder_init_ret
505 coder_init(file_pair *pair)
506 {
507 lzma_ret ret = LZMA_PROG_ERROR;
508
509 // In most cases if there is input left when coding finishes,
510 // something has gone wrong. Exceptions are --single-stream
511 // and decoding .lz files which can contain trailing non-.lz data.
512 // These will be handled later in this function.
513 allow_trailing_input = false;
514
515 if (opt_mode == MODE_COMPRESS) {
516 #ifdef HAVE_ENCODERS
517 switch (opt_format) {
518 case FORMAT_AUTO:
519 // args.c ensures this.
520 assert(0);
521 break;
522
523 case FORMAT_XZ:
524 # ifdef MYTHREAD_ENABLED
525 if (hardware_threads_is_mt())
526 ret = lzma_stream_encoder_mt(
527 &strm, &mt_options);
528 else
529 # endif
530 ret = lzma_stream_encoder(
531 &strm, filters, check);
532 break;
533
534 case FORMAT_LZMA:
535 ret = lzma_alone_encoder(&strm, filters[0].options);
536 break;
537
538 # ifdef HAVE_LZIP_DECODER
539 case FORMAT_LZIP:
540 // args.c should disallow this.
541 assert(0);
542 ret = LZMA_PROG_ERROR;
543 break;
544 # endif
545
546 case FORMAT_RAW:
547 ret = lzma_raw_encoder(&strm, filters);
548 break;
549 }
550 #endif
551 } else {
552 #ifdef HAVE_DECODERS
553 uint32_t flags = 0;
554
555 // It seems silly to warn about unsupported check if the
556 // check won't be verified anyway due to --ignore-check.
557 if (opt_ignore_check)
558 flags |= LZMA_IGNORE_CHECK;
559 else
560 flags |= LZMA_TELL_UNSUPPORTED_CHECK;
561
562 if (opt_single_stream)
563 allow_trailing_input = true;
564 else
565 flags |= LZMA_CONCATENATED;
566
567 // We abuse FORMAT_AUTO to indicate unknown file format,
568 // for which we may consider passthru mode.
569 enum format_type init_format = FORMAT_AUTO;
570
571 switch (opt_format) {
572 case FORMAT_AUTO:
573 // .lz is checked before .lzma since .lzma detection
574 // is more complicated (no magic bytes).
575 if (is_format_xz())
576 init_format = FORMAT_XZ;
577 # ifdef HAVE_LZIP_DECODER
578 else if (is_format_lzip())
579 init_format = FORMAT_LZIP;
580 # endif
581 else if (is_format_lzma())
582 init_format = FORMAT_LZMA;
583 break;
584
585 case FORMAT_XZ:
586 if (is_format_xz())
587 init_format = FORMAT_XZ;
588 break;
589
590 case FORMAT_LZMA:
591 if (is_format_lzma())
592 init_format = FORMAT_LZMA;
593 break;
594
595 # ifdef HAVE_LZIP_DECODER
596 case FORMAT_LZIP:
597 if (is_format_lzip())
598 init_format = FORMAT_LZIP;
599 break;
600 # endif
601
602 case FORMAT_RAW:
603 init_format = FORMAT_RAW;
604 break;
605 }
606
607 switch (init_format) {
608 case FORMAT_AUTO:
609 // Unknown file format. If --decompress --stdout
610 // --force have been given, then we copy the input
611 // as is to stdout. Checking for MODE_DECOMPRESS
612 // is needed, because we don't want to do use
613 // passthru mode with --test.
614 if (opt_mode == MODE_DECOMPRESS
615 && opt_stdout && opt_force) {
616 // These are needed for progress info.
617 strm.total_in = 0;
618 strm.total_out = 0;
619 return CODER_INIT_PASSTHRU;
620 }
621
622 ret = LZMA_FORMAT_ERROR;
623 break;
624
625 case FORMAT_XZ:
626 # ifdef MYTHREAD_ENABLED
627 mt_options.flags = flags;
628
629 mt_options.threads = hardware_threads_get();
630 mt_options.memlimit_stop
631 = hardware_memlimit_get(MODE_DECOMPRESS);
632
633 // If single-threaded mode was requested, set the
634 // memlimit for threading to zero. This forces the
635 // decoder to use single-threaded mode which matches
636 // the behavior of lzma_stream_decoder().
637 //
638 // Otherwise use the limit for threaded decompression
639 // which has a sane default (users are still free to
640 // make it insanely high though).
641 mt_options.memlimit_threading
642 = mt_options.threads == 1
643 ? 0 : hardware_memlimit_mtdec_get();
644
645 ret = lzma_stream_decoder_mt(&strm, &mt_options);
646 # else
647 ret = lzma_stream_decoder(&strm,
648 hardware_memlimit_get(
649 MODE_DECOMPRESS), flags);
650 # endif
651 break;
652
653 case FORMAT_LZMA:
654 ret = lzma_alone_decoder(&strm,
655 hardware_memlimit_get(
656 MODE_DECOMPRESS));
657 break;
658
659 # ifdef HAVE_LZIP_DECODER
660 case FORMAT_LZIP:
661 allow_trailing_input = true;
662 ret = lzma_lzip_decoder(&strm,
663 hardware_memlimit_get(
664 MODE_DECOMPRESS), flags);
665 break;
666 # endif
667
668 case FORMAT_RAW:
669 // Memory usage has already been checked in
670 // coder_set_compression_settings().
671 ret = lzma_raw_decoder(&strm, filters);
672 break;
673 }
674
675 // Try to decode the headers. This will catch too low
676 // memory usage limit in case it happens in the first
677 // Block of the first Stream, which is where it very
678 // probably will happen if it is going to happen.
679 //
680 // This will also catch unsupported check type which
681 // we treat as a warning only. If there are empty
682 // concatenated Streams with unsupported check type then
683 // the message can be shown more than once here. The loop
684 // is used in case there is first a warning about
685 // unsupported check type and then the first Block
686 // would exceed the memlimit.
687 if (ret == LZMA_OK && init_format != FORMAT_RAW) {
688 strm.next_out = NULL;
689 strm.avail_out = 0;
690 while ((ret = lzma_code(&strm, LZMA_RUN))
691 == LZMA_UNSUPPORTED_CHECK)
692 message_warning(_("%s: %s"), pair->src_name,
693 message_strm(ret));
694
695 // With --single-stream lzma_code won't wait for
696 // LZMA_FINISH and thus it can return LZMA_STREAM_END
697 // if the file has no uncompressed data inside.
698 // So treat LZMA_STREAM_END as LZMA_OK here.
699 // When lzma_code() is called again in coder_normal()
700 // it will return LZMA_STREAM_END again.
701 if (ret == LZMA_STREAM_END)
702 ret = LZMA_OK;
703 }
704 #endif
705 }
706
707 if (ret != LZMA_OK) {
708 message_error(_("%s: %s"), pair->src_name, message_strm(ret));
709 if (ret == LZMA_MEMLIMIT_ERROR)
710 message_mem_needed(V_ERROR, lzma_memusage(&strm));
711
712 return CODER_INIT_ERROR;
713 }
714
715 return CODER_INIT_NORMAL;
716 }
717
718
719 /// Resolve conflicts between opt_block_size and opt_block_list in single
720 /// threaded mode. We want to default to opt_block_list, except when it is
721 /// larger than opt_block_size. If this is the case for the current Block
722 /// at *list_pos, then we break into smaller Blocks. Otherwise advance
723 /// to the next Block in opt_block_list, and break apart if needed.
724 static void
725 split_block(uint64_t *block_remaining,
726 uint64_t *next_block_remaining,
727 size_t *list_pos)
728 {
729 if (*next_block_remaining > 0) {
730 // The Block at *list_pos has previously been split up.
731 assert(!hardware_threads_is_mt());
732 assert(opt_block_size > 0);
733 assert(opt_block_list != NULL);
734
735 if (*next_block_remaining > opt_block_size) {
736 // We have to split the current Block at *list_pos
737 // into another opt_block_size length Block.
738 *block_remaining = opt_block_size;
739 } else {
740 // This is the last remaining split Block for the
741 // Block at *list_pos.
742 *block_remaining = *next_block_remaining;
743 }
744
745 *next_block_remaining -= *block_remaining;
746
747 } else {
748 // The Block at *list_pos has been finished. Go to the next
749 // entry in the list. If the end of the list has been reached,
750 // reuse the size of the last Block.
751 if (opt_block_list[*list_pos + 1] != 0)
752 ++*list_pos;
753
754 *block_remaining = opt_block_list[*list_pos];
755
756 // If in single-threaded mode, split up the Block if needed.
757 // This is not needed in multi-threaded mode because liblzma
758 // will do this due to how threaded encoding works.
759 if (!hardware_threads_is_mt() && opt_block_size > 0
760 && *block_remaining > opt_block_size) {
761 *next_block_remaining
762 = *block_remaining - opt_block_size;
763 *block_remaining = opt_block_size;
764 }
765 }
766 }
767
768
769 static bool
770 coder_write_output(file_pair *pair)
771 {
772 if (opt_mode != MODE_TEST) {
773 if (io_write(pair, &out_buf, IO_BUFFER_SIZE - strm.avail_out))
774 return true;
775 }
776
777 strm.next_out = out_buf.u8;
778 strm.avail_out = IO_BUFFER_SIZE;
779 return false;
780 }
781
782
783 /// Compress or decompress using liblzma.
784 static bool
785 coder_normal(file_pair *pair)
786 {
787 // Encoder needs to know when we have given all the input to it.
788 // The decoders need to know it too when we are using
789 // LZMA_CONCATENATED. We need to check for src_eof here, because
790 // the first input chunk has been already read if decompressing,
791 // and that may have been the only chunk we will read.
792 lzma_action action = pair->src_eof ? LZMA_FINISH : LZMA_RUN;
793
794 lzma_ret ret;
795
796 // Assume that something goes wrong.
797 bool success = false;
798
799 // block_remaining indicates how many input bytes to encode before
800 // finishing the current .xz Block. The Block size is set with
801 // --block-size=SIZE and --block-list. They have an effect only when
802 // compressing to the .xz format. If block_remaining == UINT64_MAX,
803 // only a single block is created.
804 uint64_t block_remaining = UINT64_MAX;
805
806 // next_block_remaining for when we are in single-threaded mode and
807 // the Block in --block-list is larger than the --block-size=SIZE.
808 uint64_t next_block_remaining = 0;
809
810 // Position in opt_block_list. Unused if --block-list wasn't used.
811 size_t list_pos = 0;
812
813 // Handle --block-size for single-threaded mode and the first step
814 // of --block-list.
815 if (opt_mode == MODE_COMPRESS && opt_format == FORMAT_XZ) {
816 // --block-size doesn't do anything here in threaded mode,
817 // because the threaded encoder will take care of splitting
818 // to fixed-sized Blocks.
819 if (!hardware_threads_is_mt() && opt_block_size > 0)
820 block_remaining = opt_block_size;
821
822 // If --block-list was used, start with the first size.
823 //
824 // For threaded case, --block-size specifies how big Blocks
825 // the encoder needs to be prepared to create at maximum
826 // and --block-list will simultaneously cause new Blocks
827 // to be started at specified intervals. To keep things
828 // logical, the same is done in single-threaded mode. The
829 // output is still not identical because in single-threaded
830 // mode the size info isn't written into Block Headers.
831 if (opt_block_list != NULL) {
832 if (block_remaining < opt_block_list[list_pos]) {
833 assert(!hardware_threads_is_mt());
834 next_block_remaining = opt_block_list[list_pos]
835 - block_remaining;
836 } else {
837 block_remaining = opt_block_list[list_pos];
838 }
839 }
840 }
841
842 strm.next_out = out_buf.u8;
843 strm.avail_out = IO_BUFFER_SIZE;
844
845 while (!user_abort) {
846 // Fill the input buffer if it is empty and we aren't
847 // flushing or finishing.
848 if (strm.avail_in == 0 && action == LZMA_RUN) {
849 strm.next_in = in_buf.u8;
850 strm.avail_in = io_read(pair, &in_buf,
851 my_min(block_remaining,
852 IO_BUFFER_SIZE));
853
854 if (strm.avail_in == SIZE_MAX)
855 break;
856
857 if (pair->src_eof) {
858 action = LZMA_FINISH;
859
860 } else if (block_remaining != UINT64_MAX) {
861 // Start a new Block after every
862 // opt_block_size bytes of input.
863 block_remaining -= strm.avail_in;
864 if (block_remaining == 0)
865 action = LZMA_FULL_BARRIER;
866 }
867
868 if (action == LZMA_RUN && pair->flush_needed)
869 action = LZMA_SYNC_FLUSH;
870 }
871
872 // Let liblzma do the actual work.
873 ret = lzma_code(&strm, action);
874
875 // Write out if the output buffer became full.
876 if (strm.avail_out == 0) {
877 if (coder_write_output(pair))
878 break;
879 }
880
881 if (ret == LZMA_STREAM_END && (action == LZMA_SYNC_FLUSH
882 || action == LZMA_FULL_BARRIER)) {
883 if (action == LZMA_SYNC_FLUSH) {
884 // Flushing completed. Write the pending data
885 // out immediately so that the reading side
886 // can decompress everything compressed so far.
887 if (coder_write_output(pair))
888 break;
889
890 // Mark that we haven't seen any new input
891 // since the previous flush.
892 pair->src_has_seen_input = false;
893 pair->flush_needed = false;
894 } else {
895 // Start a new Block after LZMA_FULL_BARRIER.
896 if (opt_block_list == NULL) {
897 assert(!hardware_threads_is_mt());
898 assert(opt_block_size > 0);
899 block_remaining = opt_block_size;
900 } else {
901 split_block(&block_remaining,
902 &next_block_remaining,
903 &list_pos);
904 }
905 }
906
907 // Start a new Block after LZMA_FULL_FLUSH or continue
908 // the same block after LZMA_SYNC_FLUSH.
909 action = LZMA_RUN;
910
911 } else if (ret != LZMA_OK) {
912 // Determine if the return value indicates that we
913 // won't continue coding. LZMA_NO_CHECK would be
914 // here too if LZMA_TELL_ANY_CHECK was used.
915 const bool stop = ret != LZMA_UNSUPPORTED_CHECK;
916
917 if (stop) {
918 // Write the remaining bytes even if something
919 // went wrong, because that way the user gets
920 // as much data as possible, which can be good
921 // when trying to get at least some useful
922 // data out of damaged files.
923 if (coder_write_output(pair))
924 break;
925 }
926
927 if (ret == LZMA_STREAM_END) {
928 if (allow_trailing_input) {
929 io_fix_src_pos(pair, strm.avail_in);
930 success = true;
931 break;
932 }
933
934 // Check that there is no trailing garbage.
935 // This is needed for LZMA_Alone and raw
936 // streams. This is *not* done with .lz files
937 // as that format specifically requires
938 // allowing trailing garbage.
939 if (strm.avail_in == 0 && !pair->src_eof) {
940 // Try reading one more byte.
941 // Hopefully we don't get any more
942 // input, and thus pair->src_eof
943 // becomes true.
944 strm.avail_in = io_read(
945 pair, &in_buf, 1);
946 if (strm.avail_in == SIZE_MAX)
947 break;
948
949 assert(strm.avail_in == 0
950 || strm.avail_in == 1);
951 }
952
953 if (strm.avail_in == 0) {
954 assert(pair->src_eof);
955 success = true;
956 break;
957 }
958
959 // We hadn't reached the end of the file.
960 ret = LZMA_DATA_ERROR;
961 assert(stop);
962 }
963
964 // If we get here and stop is true, something went
965 // wrong and we print an error. Otherwise it's just
966 // a warning and coding can continue.
967 if (stop) {
968 message_error(_("%s: %s"), pair->src_name,
969 message_strm(ret));
970 } else {
971 message_warning(_("%s: %s"), pair->src_name,
972 message_strm(ret));
973
974 // When compressing, all possible errors set
975 // stop to true.
976 assert(opt_mode != MODE_COMPRESS);
977 }
978
979 if (ret == LZMA_MEMLIMIT_ERROR) {
980 // Display how much memory it would have
981 // actually needed.
982 message_mem_needed(V_ERROR,
983 lzma_memusage(&strm));
984 }
985
986 if (stop)
987 break;
988 }
989
990 // Show progress information under certain conditions.
991 message_progress_update();
992 }
993
994 return success;
995 }
996
997
998 /// Copy from input file to output file without processing the data in any
999 /// way. This is used only when trying to decompress unrecognized files
1000 /// with --decompress --stdout --force, so the output is always stdout.
1001 static bool
1002 coder_passthru(file_pair *pair)
1003 {
1004 while (strm.avail_in != 0) {
1005 if (user_abort)
1006 return false;
1007
1008 if (io_write(pair, &in_buf, strm.avail_in))
1009 return false;
1010
1011 strm.total_in += strm.avail_in;
1012 strm.total_out = strm.total_in;
1013 message_progress_update();
1014
1015 strm.avail_in = io_read(pair, &in_buf, IO_BUFFER_SIZE);
1016 if (strm.avail_in == SIZE_MAX)
1017 return false;
1018 }
1019
1020 return true;
1021 }
1022
1023
1024 extern void
1025 coder_run(const char *filename)
1026 {
1027 // Set and possibly print the filename for the progress message.
1028 message_filename(filename);
1029
1030 // Try to open the input file.
1031 file_pair *pair = io_open_src(filename);
1032 if (pair == NULL)
1033 return;
1034
1035 // Assume that something goes wrong.
1036 bool success = false;
1037
1038 if (opt_mode == MODE_COMPRESS) {
1039 strm.next_in = NULL;
1040 strm.avail_in = 0;
1041 } else {
1042 // Read the first chunk of input data. This is needed
1043 // to detect the input file type.
1044 strm.next_in = in_buf.u8;
1045 strm.avail_in = io_read(pair, &in_buf, IO_BUFFER_SIZE);
1046 }
1047
1048 if (strm.avail_in != SIZE_MAX) {
1049 // Initialize the coder. This will detect the file format
1050 // and, in decompression or testing mode, check the memory
1051 // usage of the first Block too. This way we don't try to
1052 // open the destination file if we see that coding wouldn't
1053 // work at all anyway. This also avoids deleting the old
1054 // "target" file if --force was used.
1055 const enum coder_init_ret init_ret = coder_init(pair);
1056
1057 if (init_ret != CODER_INIT_ERROR && !user_abort) {
1058 // Don't open the destination file when --test
1059 // is used.
1060 if (opt_mode == MODE_TEST || !io_open_dest(pair)) {
1061 // Remember the current time. It is needed
1062 // for progress indicator.
1063 mytime_set_start_time();
1064
1065 // Initialize the progress indicator.
1066 //
1067 // NOTE: When reading from stdin, fstat()
1068 // isn't called on it and thus src_st.st_size
1069 // is zero. If stdin pointed to a regular
1070 // file, it would still be possible to know
1071 // the file size but then we would also need
1072 // to take into account the current reading
1073 // position since with stdin it isn't
1074 // necessarily at the beginning of the file.
1075 const bool is_passthru = init_ret
1076 == CODER_INIT_PASSTHRU;
1077 const uint64_t in_size
1078 = pair->src_st.st_size <= 0
1079 ? 0 : (uint64_t)(pair->src_st.st_size);
1080 message_progress_start(&strm,
1081 is_passthru, in_size);
1082
1083 // Do the actual coding or passthru.
1084 if (is_passthru)
1085 success = coder_passthru(pair);
1086 else
1087 success = coder_normal(pair);
1088
1089 message_progress_end(success);
1090 }
1091 }
1092 }
1093
1094 // Close the file pair. It needs to know if coding was successful to
1095 // know if the source or target file should be unlinked.
1096 io_close(pair, success);
1097
1098 return;
1099 }
1100
1101
1102 #ifndef NDEBUG
1103 extern void
1104 coder_free(void)
1105 {
1106 lzma_end(&strm);
1107 return;
1108 }
1109 #endif