-
Notifications
You must be signed in to change notification settings - Fork 34
/
Copy pathpcompress.c
3739 lines (3413 loc) · 105 KB
/
pcompress.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* This file is a part of Pcompress, a chunked parallel multi-
* algorithm lossless compression and decompression program.
*
* Copyright (C) 2012-2013 Moinak Ghosh. All rights reserved.
* Use is subject to license terms.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 3 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this program.
* If not, see <http://www.gnu.org/licenses/>.
*
* [email protected], http://moinakg.wordpress.com/
*
*/
/*
* pcompress - Do a chunked parallel compression/decompression and archiving
* of one or more files.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/param.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <strings.h>
#include <limits.h>
#include <unistd.h>
#include <libgen.h>
#include <utils.h>
#include <pcompress.h>
#include <allocator.h>
#include <rabin_dedup.h>
#ifndef _MPLV2_LICENSE_
#include <lzp.h>
#endif
#include <transpose.h>
#include <delta2/delta2.h>
#include <crypto/crypto_utils.h>
#include <crypto_xsalsa20.h>
#include <ctype.h>
#include <errno.h>
#include <pc_archive.h>
#include <filters/dispack/dis.hpp>
#include "filters/dict/DictFilter.h"
/*
 * We use 8MB chunks by default.
 */
#define DEFAULT_CHUNKSIZE (8 * 1024 * 1024)
/*
 * Evaluates to x minus one fifth of x, i.e. 80% of x (integer division when
 * x is an integer type). The argument is expanded twice, so do not pass an
 * expression with side effects.
 */
#define EIGHTY_PCT(x) ((x) - ((x)/5))
/*
 * Bundle of state for the writer side of a compress/decompress run.
 * NOTE(review): presumably this is what gets passed as the void * argument
 * of writer_thread() - confirm against the thread creation site.
 */
struct wdata {
/* Array of pointers to per-thread chunk state (presumably nprocs entries - confirm). */
struct cmp_data **dary;
/* File descriptor, presumably the output the writer writes to - confirm. */
int wfd;
/* Thread count (presumably the number of worker threads - confirm). */
int nprocs;
/* Chunk size in bytes. */
int64_t chunksize;
/* Back-pointer to the owning pcompress context. */
pc_ctx_t *pctx;
};
/* File-scope mutex; NOTE(review): the name suggests it serializes option parsing - confirm at its use sites. */
pthread_mutex_t opt_parse = PTHREAD_MUTEX_INITIALIZER;
/* Forward declarations of routines defined later in this file. */
static void * writer_thread(void *dat);
static int init_algo(pc_ctx_t *pctx, const char *algo, int bail);
/* CRC32 routine defined outside this file. */
extern uint32_t lzma_crc32(const uint8_t *buf, uint64_t size, uint32_t crc);
/*
 * Print the command line help text to stderr.
 *
 * The first fprintf() emits the version/license banner followed by the
 * archiving, single-file compression and decompression synopses; the
 * executable name (pctx->exec_name) is substituted into each of the three
 * synopsis lines. The second fprintf() emits the encryption options.
 *
 * pctx must be non-NULL; only pctx->exec_name is read.
 */
void DLL_EXPORT
usage(pc_ctx_t *pctx)
{
	fprintf(stderr,
"\nPcompress Version %s\n"
"License: %s\n\n"
"See README.md for detailed usage.\n\n"
"Standard Usage\n"
"==============\n"
" Standard usage consists of a few common options to control basic behavior with auto-\n"
" setting of various other parameters.\n\n"
" Archiving\n"
" ---------\n"
" %s -a [-v] [-l <compress level>] [-s <chunk size>] [-c <algorithm>]\n"
" [<file1> <directory1> <file2> ...] [-t <number>] [-S <chunk checksum>]\n"
" <archive filename or '-'>\n\n"
" Archives a given set of files and/or directories into a compressed PAX archive which\n"
" is then compressed.\n\n"
" -a Enables the archive mode.\n"
" -l <compress level>\n"
" Select a compression level from 1 (fast) to 14 (slow). Default: 6\n\n"
" -s <chunk size>\n"
" Specifies the maximum chunk size to split the data for parallelism. Values\n"
" can be in bytes or with suffix(k - KB, m - MB, g - GB). Default: 8m\n"
" Larger chunks can produce better compression at the cost of memory.\n\n"
" -c <algorithm>\n"
" The compression algorithm. Default algorithm when archiving is adapt2.\n"
" -v Enables verbose mode.\n\n"
" -t <number>\n"
" Sets the number of compression threads. Default: core count.\n"
" -T Disable separate metadata stream.\n"
" -S <chunk checksum>\n"
" The chunk verification checksum. Default: BLAKE256. Others are: CRC64, SHA256,\n"
" SHA512, KECCAK256, KECCAK512, BLAKE256, BLAKE512.\n"
" <archive filename>\n"
" Pathname of the resulting archive. A '.pz' extension is automatically added\n"
" if not already present. This can be '-' to output to stdout.\n\n"
" Single File Compression\n"
" -----------------------\n"
" %s -c <algorithm> [-l <compress level>] [-s <chunk size>] [-p] [<file>]\n"
" [-t <number>] [-S <chunk checksum>] [<target file or '-'>]\n\n"
" Takes a single file as input and produces a compressed file. Archiving is not performed.\n"
" This can also work in streaming mode.\n\n"
" -c <algorithm>\n"
" See above. Also see section 'Compression Algorithms' in README.md for details.\n"
" -l <compress level>\n"
" -s <chunk size>\n"
" -t <number>\n"
" -S <chunk checksum>\n"
" See above.\n"
" Note: In single file compression mode with adapt2 or adapt algorithm, larger\n"
" chunks may not necessarily produce better compression.\n"
" -p Make Pcompress work in streaming mode. Input is stdin, output is stdout.\n\n"
" <target file>\n"
" Pathname of the compressed file to be created or '-' for stdout.\n\n"
" Decompression, Listing and Archive extraction\n"
" ---------------------------------------------\n"
" %s <-d|-i> [-m] [-K] <compressed file or '-'> [<target file or directory>]\n\n"
" -d Extract archive to target dir or current dir.\n"
" -i Only list contents of the archive, do not extract.\n\n"
" -m Enable restoring *all* permissions, ACLs, Extended Attributes etc.\n"
" Equivalent to the '-p' option in tar.\n"
" -K Do not overwrite newer files.\n"
" -m and -K are only meaningful if the compressed file is an archive. For single file\n"
" compressed mode these options are ignored.\n\n"
" <compressed file>\n"
" Specifies the compressed file or archive. This can be '-' to indicate reading\n"
" from stdin while write goes to <target file>\n\n"
" <target file or directory>\n"
" If single file compression was used then this is the output file.\n"
" Default output name if omitted: <input filename>.out\n\n"
" If Archiving was done then this should be the name of a directory into which\n"
" extracted files are restored. Default if omitted: Current directory.\n\n",
	    UTILITY_VERSION, LICENSE_STRING, pctx->exec_name, pctx->exec_name, pctx->exec_name);

	fprintf(stderr,
" Encryption\n"
" ----------\n"
" -e <ALGO> Encrypt chunks with the given encryption algorithm. The ALGO parameter\n"
" can be one of AES or SALSA20. Both are used in CTR stream encryption\n"
" mode. The password can be prompted from the user or read from a file.\n"
" Unique keys are generated every time pcompress is run even when giving\n"
" the same password. Default key length is 256-bits (see -k below).\n"
" -w <pathname>\n"
" Provide a file which contains the encryption password. This file must\n"
" be readable and writable since it is zeroed out after the password is\n"
" read.\n"
" -k <key length>\n"
" Specify key length. Can be 16 for 128 bit or 32 for 256 bit. Default\n"
" is 32 for 256 bit keys.\n\n");
}
/*
 * Log a short summary of how well the chunks compressed: the chunk count and,
 * when at least one chunk was processed, the smallest, largest and average
 * compressed chunk sizes, each also shown as a percentage of pctx->chunksize.
 *
 * Side effect: pctx->avg_chunk is divided by pctx->chunk_num in place, so on
 * return it holds the per-chunk average rather than the value it held on entry.
 */
static void
show_compression_stats(pc_ctx_t *pctx)
{
	double full_chunk;

	log_msg(LOG_INFO, 0, "\nCompression Statistics");
	log_msg(LOG_INFO, 0, "======================");
	log_msg(LOG_INFO, 0, "Total chunks : %u", pctx->chunk_num);

	/* Nothing was processed: avoid dividing by a zero chunk count below. */
	if (pctx->chunk_num == 0) {
		log_msg(LOG_INFO, 0, "No statistics to display.");
		return;
	}

	/* Every percentage is relative to the configured chunk size. */
	full_chunk = (double)pctx->chunksize;

	log_msg(LOG_INFO, 0, "Best compressed chunk : %s(%.2f%%)",
	    bytes_to_size(pctx->smallest_chunk),
	    (double)pctx->smallest_chunk / full_chunk * 100);

	log_msg(LOG_INFO, 0, "Worst compressed chunk : %s(%.2f%%)",
	    bytes_to_size(pctx->largest_chunk),
	    (double)pctx->largest_chunk / full_chunk * 100);

	/* Turn the accumulated value into an average before reporting it. */
	pctx->avg_chunk /= pctx->chunk_num;

	log_msg(LOG_INFO, 0, "Avg compressed chunk : %s(%.2f%%)\n",
	    bytes_to_size(pctx->avg_chunk),
	    (double)pctx->avg_chunk / full_chunk * 100);
}
/*
 * Wrappers that run optional pre-processing filters around the main
 * compression/decompression routine.
 *
 * Output layout written by preproc_compress():
 *   Byte 0          : flag byte recording which pre-processors were applied,
 *                     plus PREPROC_COMPRESSED if the final compressor ran.
 *   Byte 1 - Byte 8 : size of the buffer after pre-processing (present only
 *                     when PREPROC_COMPRESSED is set; otherwise the
 *                     pre-processed data starts directly at byte 1).
 *
 * A buffer may end up pre-processed but not compressed, if the final
 * compressor fails to shrink it. The opposite (compressed by this wrapper
 * with a zero flag byte and reported as success) never happens: with no
 * pre-processing and no compression the routine reports failure.
 */

/* Exchange two stage-buffer pointers (source and destination of a filter stage). */
static void
preproc_swap_bufs(uchar_t **a, uchar_t **b)
{
	uchar_t *held = *a;

	*a = *b;
	*b = held;
}

/*
 * Run the enabled pre-processing filters over src and then hand the result to
 * cmp_func.
 *
 * The filters ping-pong between the src and dst buffers: each successful stage
 * reads from `from`, writes to `to`, and then the two pointers are swapped so
 * that `from` always refers to the most recent data. Both src and dst are
 * therefore used as scratch space and src is modified.
 *
 * On entry *dstlen is read as the initial output length; on success it is set
 * to the number of bytes written to dst (header included).
 *
 * Returns the (non-negative) result of cmp_func when compression succeeded,
 * 0 when only pre-processing succeeded, and -1 when neither did.
 */
static int
preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t srclen,
    void *dst, uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data,
    algo_props_t *props, int interesting)
{
	uchar_t *dest = (uchar_t *)dst;
	uchar_t type = 0;
	uchar_t *from = src, *to = dst;
	uint64_t fromlen = srclen;
	uint64_t _dstlen = *dstlen;
	int result = 0;
	int stype = PC_SUBTYPE(btype);
	int analyzed = 0;
	analyzer_ctx_t actx;
	DEBUG_STAT_EN(double strt, en);

	/* Analyze the buffer contents when the declared type is not conclusive. */
	if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF ||
	    (PC_TYPE(btype) & TYPE_TEXT) || interesting) {
		analyze_buffer(src, srclen, &actx);
		analyzed = 1;
		if (pctx->adapt_mode)
			adapt_set_analyzer_ctx(data, &actx);
	}

	/*
	 * Dispack is used for 32-bit EXE files via a libarchive filter routine.
	 * For 64-bit exes or AR archives we apply an E8E9 CALL/JMP transform filter.
	 */
	if (pctx->exe_preprocess) {
		int dispacked = 0;

		if (stype == TYPE_EXE32 || stype == TYPE_EXE32_PE ||
		    stype == TYPE_EXE64 || stype == TYPE_ARCHIVE_AR) {
			/*
			 * File-level Dispack probably did not happen for 32-bit
			 * EXEs because the file was large, so try raw-block
			 * Dispack here. If that gives no worthwhile reduction,
			 * E8E9 below is the final fallback.
			 */
			_dstlen = fromlen;
			result = dispack_encode(from, fromlen, to, &_dstlen);
			if (result != -1) {
				preproc_swap_bufs(&from, &to);
				fromlen = _dstlen;
				type |= PREPROC_TYPE_DISPACK;
				dispacked = 1;
			}
		}

		if (!dispacked) {
			/* E8E9 works in place on a copy and keeps the length. */
			_dstlen = fromlen;
			memcpy(to, from, fromlen);
			if (Forward_E89(to, fromlen) == 0) {
				preproc_swap_bufs(&from, &to);
				fromlen = _dstlen;
				type |= PREPROC_TYPE_E8E9;
			}
		}
	}

	/*
	 * Enabling LZP also enables the DICT filter since we are dealing with
	 * text in any case.
	 */
	if (pctx->lzp_preprocess) {
		int b_type;

		b_type = analyzed ? actx.ten_pct.btype :
		    analyze_buffer_simple(from, fromlen);
		if (PC_TYPE(b_type) & TYPE_TEXT) {
			_dstlen = fromlen;
			result = dict_encode(from, fromlen, to, &_dstlen,
			    (stype == TYPE_DNA_SEQ));
			if (result != -1) {
				preproc_swap_bufs(&from, &to);
				fromlen = _dstlen;
				type |= PREPROC_TYPE_DICT;
			}
		}
	}

#ifndef _MPLV2_LICENSE_
	if (pctx->lzp_preprocess && stype != TYPE_BMP && stype != TYPE_TIFF) {
		int hashsize, b_type;
		int64_t lzp_len;

		b_type = analyzed ? actx.thirty_pct.btype : btype;
		if (!(PC_TYPE(b_type) & TYPE_BINARY)) {
			hashsize = lzp_hash_size(level);
			lzp_len = lzp_compress((const uchar_t *)from, to, fromlen,
			    hashsize, LZP_DEFAULT_LZPMINLEN, 0);
			/* Accept LZP only if it produced less than the original input. */
			if (lzp_len >= 0 && lzp_len < srclen) {
				preproc_swap_bufs(&from, &to);
				fromlen = lzp_len;
				type |= PREPROC_TYPE_LZP;
			}
		}
	}
#endif

	if (pctx->enable_delta2_encode && props->delta2_span > 0 &&
	    stype != TYPE_DNA_SEQ && stype != TYPE_BMP &&
	    stype != TYPE_TIFF && stype != TYPE_MP4) {
		int b_type;

		b_type = analyzed ? actx.ten_pct.btype : btype;
		if (!(PC_TYPE(b_type) & TYPE_TEXT)) {
			_dstlen = fromlen;
			result = delta2_encode(from, fromlen, to,
			    &_dstlen, props->delta2_span,
			    pctx->delta2_nstrides);
			if (result != -1) {
				preproc_swap_bufs(&from, &to);
				fromlen = _dstlen;
				type |= PREPROC_TYPE_DELTA2;
			}
		}
	}

	/*
	 * The pointers were swapped after every successful stage, so if `from`
	 * now refers to dst the latest data sits in dst and must be moved back
	 * to src. Otherwise it is already in src and no copy is needed.
	 */
	if (from == dst)
		memcpy(src, dst, fromlen);

	srclen = fromlen;
	dest[0] = type;
	U64_P(dest + 1) = htonll(srclen);
	_dstlen = srclen;

	DEBUG_STAT_EN(strt = get_wtime_millis());
	result = cmp_func(src, srclen, dest+9, &_dstlen, level, chdr,
	    btype, data);
	DEBUG_STAT_EN(en = get_wtime_millis());

	if (result > -1 && _dstlen < srclen) {
		/* Compressed: 1 flag byte + 8 length bytes precede the payload. */
		dest[0] |= PREPROC_COMPRESSED;
		*dstlen = _dstlen + 9;
		DEBUG_STAT_EN(fprintf(stderr, "Chunk compression speed %.3f MB/s\n",
		    get_mb_s(srclen, strt, en)));
		return (result);
	}

	DEBUG_STAT_EN(fprintf(stderr, "Chunk did not compress.\n"));

	/*
	 * Compression failed. If at least one pre-processing stage succeeded
	 * the flag byte is non-zero; report success anyway so that
	 * decompression reverses the pre-processing. The flags show that no
	 * compression was done, so the decompress routine will not be called.
	 */
	if (type > 0) {
		memcpy(dest+1, src, srclen);
		*dstlen = srclen + 1;
		return (0);
	}

	/*
	 * Neither compressed nor pre-processed: signal an error upstream so
	 * that the caller handles the chunk accordingly.
	 */
	return (-1);
}
/*
 * Reverse of preproc_compress(): decompress the payload (if it was compressed)
 * and then undo each pre-processing stage recorded in the leading flag byte.
 *
 * src/srclen : encoded chunk, starting with the flag byte. src is also used
 *              as scratch space and is overwritten.
 * dst/dstlen : output buffer. On entry *dstlen is taken as the output length
 *              limit handed to the stage decoders; on success it is set to
 *              the decoded length when at least one stage ran.
 *
 * Returns 0 on success. Every failure detected here is reported as -1, except
 * that a negative status from dec_func is passed through unchanged.
 *
 * NOTE(review): the 8-byte stored length read from the chunk replaces *dstlen
 * before dec_func is called and is not checked against the capacity of dst.
 * Confirm that the callers' buffers are sized for any value a (possibly
 * tampered) chunk can carry.
 */
static int
preproc_decompress(pc_ctx_t *pctx, compress_func_ptr dec_func, void *src, uint64_t srclen,
    void *dst, uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data,
    algo_props_t *props)
{
	uchar_t *sorc = (uchar_t *)src, type;
	int result;
	uint64_t _dstlen = *dstlen, _dstlen1 = *dstlen;
	DEBUG_STAT_EN(double strt, en);

	/*
	 * The flag byte must be present. Without this check the unsigned
	 * decrement below would wrap srclen around to a huge value.
	 */
	if (srclen < 1) {
		log_msg(LOG_ERR, 0, "Preprocessed chunk is truncated.");
		return (-1);
	}
	type = *sorc;
	++sorc;
	--srclen;

	if (type & PREPROC_COMPRESSED) {
		/* The 8-byte pre-processed length must be present as well. */
		if (srclen < 8) {
			log_msg(LOG_ERR, 0, "Preprocessed chunk header is truncated.");
			return (-1);
		}
		*dstlen = ntohll(U64_P(sorc));
		sorc += 8;
		srclen -= 8;
		DEBUG_STAT_EN(strt = get_wtime_millis());
		result = dec_func(sorc, srclen, dst, dstlen, level, chdr, btype, data);
		DEBUG_STAT_EN(en = get_wtime_millis());

		if (result < 0) return (result);
		DEBUG_STAT_EN(fprintf(stderr, "Chunk decompression speed %.3f MB/s\n",
		    get_mb_s(srclen, strt, en)));
		/* Later stages read from src, so move the output back there. */
		memcpy(src, dst, *dstlen);
		srclen = *dstlen;
	} else {
		/* Not compressed: the pre-processed data follows the flag byte. */
		src = sorc;
	}

	/* Stages are undone in the reverse of the order they were applied. */
	if (type & PREPROC_TYPE_DELTA2) {
		result = delta2_decode((uchar_t *)src, srclen, (uchar_t *)dst, &_dstlen);
		if (result != -1) {
			memcpy(src, dst, _dstlen);
			srclen = _dstlen;
			*dstlen = _dstlen;
			/* Restore the output limit for the next stage. */
			_dstlen = _dstlen1;
		} else {
			log_msg(LOG_ERR, 0, "Delta2 decoding failed.");
			return (result);
		}
	}

	if (type & PREPROC_TYPE_LZP) {
#ifndef _MPLV2_LICENSE_
		int hashsize;
		int64_t lzp_len;

		hashsize = lzp_hash_size(level);
		lzp_len = lzp_decompress((const uchar_t *)src, (uchar_t *)dst, srclen,
		    hashsize, LZP_DEFAULT_LZPMINLEN, 0);
		if (lzp_len > 0) {
			memcpy(src, dst, lzp_len);
			srclen = lzp_len;
			*dstlen = lzp_len;
		} else {
			log_msg(LOG_ERR, 0, "LZP decompression failed.");
			/*
			 * Always report -1: a zero return from lzp_decompress
			 * is treated as failure above and must not be handed
			 * back as a success status.
			 */
			return (-1);
		}
#else
		log_msg(LOG_ERR, 0, "LZP feature not available in this build"
		    " (MPLv2). Aborting.");
		return (-1);
#endif
	}

	if (type & PREPROC_TYPE_DICT) {
		result = dict_decode(src, srclen, dst, &_dstlen);
		if (result != -1) {
			memcpy(src, dst, _dstlen);
			srclen = _dstlen;
			*dstlen = _dstlen;
			_dstlen = _dstlen1;
		} else {
			log_msg(LOG_ERR, 0, "DICT decoding failed.");
			return (result);
		}
	}

	if (type & PREPROC_TYPE_E8E9) {
		/* E8E9 is length preserving and is undone in place inside dst. */
		_dstlen1 = srclen;
		memcpy(dst, src, srclen);
		result = Inverse_E89(dst, srclen);
		if (result != -1) {
			*dstlen = _dstlen1;
		} else {
			log_msg(LOG_ERR, 0, "E8E9 decoding failed.");
			return (result);
		}
	} else if (type & PREPROC_TYPE_DISPACK) { // Backward compatibility
		result = dispack_decode((uchar_t *)src, srclen, (uchar_t *)dst, &_dstlen1);
		if (result != -1) {
			*dstlen = _dstlen1;
		} else {
			log_msg(LOG_ERR, 0, "Dispack decoding failed.");
			return (result);
		}
	}

	/* A non-zero flag byte with none of the known bits set is corrupt. */
	if (!(type & (PREPROC_COMPRESSED|PREPROC_TYPE_DELTA2|PREPROC_TYPE_LZP|
	    PREPROC_TYPE_DISPACK|PREPROC_TYPE_DICT|PREPROC_TYPE_E8E9))
	    && type > 0) {
		log_msg(LOG_ERR, 0, "Invalid preprocessing flags: %d", type);
		return (-1);
	}

	return (0);
}
/*
 * Decompression worker; this routine is run in multiple threads. It calls the
 * decompression handler as encoded in the file header. For adaptive mode the
 * handler adapt_decompress() in turn looks at the chunk header and calls the
 * actual decompression routine.
 *
 * dat is the struct cmp_data describing this thread's chunk slot. The thread
 * loops: wait on start_sem, process one chunk, post cmp_done_sem, repeat. It
 * leaves the loop when pctx->main_cancel or tdat->cancel is seen after a
 * wakeup, on a fatal verification/decryption failure, or once
 * pctx->t_errored is set. A failed chunk is signalled by tdat->len_cmp == 0.
 *
 * Always returns NULL (the cancel path returns 0, which is the same value).
 */
static void *
perform_decompress(void *dat)
{
struct cmp_data *tdat = (struct cmp_data *)dat;
uint64_t _chunksize;
uint64_t dedupe_index_sz, dedupe_data_sz, dedupe_index_sz_cmp, dedupe_data_sz_cmp;
int rv = 0;
unsigned int blknum;
uchar_t checksum[CKSUM_MAX_BYTES];
uchar_t HDR;
uchar_t *cseg;
pc_ctx_t *pctx;
pctx = tdat->pctx;
redo:
/* Block until a chunk has been handed to this thread. */
Sem_Wait(&tdat->start_sem);
/* Global cancel: leave without posting cmp_done_sem. */
if (pctx->main_cancel)
return (NULL);
/* Per-thread cancel: report an empty chunk, then leave. */
if (unlikely(tdat->cancel)) {
tdat->len_cmp = 0;
Sem_Post(&tdat->cmp_done_sem);
return (0);
}
/*
 * If the last read returned 0 bytes there is nothing to decode; report an
 * empty chunk.
 */
if (tdat->rbytes == 0) {
tdat->len_cmp = 0;
goto cont;
}
/* The chunk starts with the checksum and MAC fields, followed by the flag byte. */
cseg = tdat->compressed_chunk + pctx->cksum_bytes + pctx->mac_bytes;
HDR = *cseg;
cseg += CHUNK_FLAG_SZ;
_chunksize = tdat->chunksize;
if (HDR & CHSIZE_MASK) {
/*
 * The original chunk size is stored in the last ORIGINAL_CHUNKSZ bytes
 * of the chunk; strip it from the byte counts and read it.
 * NOTE(review): rseg is dereferenced through an int64_t cast and may
 * not be suitably aligned - confirm this is acceptable on all targets.
 */
uchar_t *rseg;
tdat->rbytes -= ORIGINAL_CHUNKSZ;
tdat->len_cmp -= ORIGINAL_CHUNKSZ;
rseg = tdat->compressed_chunk + tdat->rbytes;
_chunksize = ntohll(*((int64_t *)rseg));
}
/*
 * If this was encrypted:
 * Verify HMAC first before anything else and then decrypt compressed data.
 */
if (pctx->encrypt_type) {
unsigned int len;
DEBUG_STAT_EN(double strt, en);
DEBUG_STAT_EN(strt = get_wtime_millis());
len = pctx->mac_bytes;
/* Save the stored HMAC, then zero its field before recomputing over the chunk. */
deserialize_checksum(checksum, tdat->compressed_chunk + pctx->cksum_bytes,
pctx->mac_bytes);
memset(tdat->compressed_chunk + pctx->cksum_bytes, 0, pctx->mac_bytes);
hmac_reinit(&tdat->chunk_hmac);
/* The HMAC covers the big-endian length, the chunk bytes and any trailing size field. */
hmac_update(&tdat->chunk_hmac, (uchar_t *)&tdat->len_cmp_be, sizeof (tdat->len_cmp_be));
hmac_update(&tdat->chunk_hmac, tdat->compressed_chunk, tdat->rbytes);
if (HDR & CHSIZE_MASK) {
uchar_t *rseg;
rseg = tdat->compressed_chunk + tdat->rbytes;
hmac_update(&tdat->chunk_hmac, rseg, ORIGINAL_CHUNKSZ);
}
hmac_final(&tdat->chunk_hmac, tdat->checksum, &len);
if (memcmp(checksum, tdat->checksum, len) != 0) {
/*
 * HMAC verification failure is fatal.
 */
log_msg(LOG_ERR, 0, "Chunk %d, HMAC verification failed", tdat->id);
pctx->main_cancel = 1;
tdat->len_cmp = 0;
pctx->t_errored = 1;
Sem_Post(&tdat->cmp_done_sem);
return (NULL);
}
DEBUG_STAT_EN(en = get_wtime_millis());
DEBUG_STAT_EN(fprintf(stderr, "HMAC Verification speed %.3f MB/s",
get_mb_s(tdat->rbytes + sizeof (tdat->len_cmp_be), strt, en)));
/*
 * Encryption algorithm should not change the size and
 * encryption is in-place.
 */
DEBUG_STAT_EN(strt = get_wtime_millis());
rv = crypto_buf(&(pctx->crypto_ctx), cseg, cseg, tdat->len_cmp, tdat->id);
if (rv == -1) {
/*
 * Decryption failure is fatal.
 * NOTE(review): unlike the HMAC failure path above, this path does
 * not set pctx->t_errored - confirm whether that is intentional.
 */
log_msg(LOG_ERR, 0, "Chunk %d, Decryption failed", tdat->id);
pctx->main_cancel = 1;
tdat->len_cmp = 0;
Sem_Post(&tdat->cmp_done_sem);
return (NULL);
}
DEBUG_STAT_EN(en = get_wtime_millis());
DEBUG_STAT_EN(fprintf(stderr, "Decryption speed %.3f MB/s\n",
get_mb_s(tdat->len_cmp, strt, en)));
} else if (pctx->mac_bytes > 0) {
/*
 * Verify header CRC32 in non-crypto mode.
 */
uint32_t crc1, crc2;
/* NOTE(review): htonl() is used to read a stored value; ntohl() would state the intent, the byte swap is the same. */
crc1 = htonl(U32_P(tdat->compressed_chunk + pctx->cksum_bytes));
/* Zero the stored CRC field before recomputing the CRC over the header. */
memset(tdat->compressed_chunk + pctx->cksum_bytes, 0, pctx->mac_bytes);
crc2 = lzma_crc32((uchar_t *)&tdat->len_cmp_be, sizeof (tdat->len_cmp_be), 0);
crc2 = lzma_crc32(tdat->compressed_chunk,
pctx->cksum_bytes + pctx->mac_bytes + CHUNK_FLAG_SZ, crc2);
if (HDR & CHSIZE_MASK) {
uchar_t *rseg;
rseg = tdat->compressed_chunk + tdat->rbytes;
crc2 = lzma_crc32(rseg, ORIGINAL_CHUNKSZ, crc2);
}
if (crc1 != crc2) {
/*
 * Header CRC32 verification failure is fatal.
 */
log_msg(LOG_ERR, 0, "Chunk %d, Header CRC verification failed", tdat->id);
pctx->main_cancel = 1;
tdat->len_cmp = 0;
pctx->t_errored = 1;
Sem_Post(&tdat->cmp_done_sem);
return (NULL);
}
/*
 * Now that header CRC32 was verified, recover the stored message
 * digest.
 */
deserialize_checksum(tdat->checksum, tdat->compressed_chunk, pctx->cksum_bytes);
}
if ((pctx->enable_rabin_scan || pctx->enable_fixed_scan || pctx->enable_rabin_global) &&
(HDR & CHUNK_FLAG_DEDUP)) {
uchar_t *cmpbuf, *ubuf;
/* Extract various sizes from dedupe header. */
parse_dedupe_hdr(cseg, &blknum, &dedupe_index_sz, &dedupe_data_sz,
&dedupe_index_sz_cmp, &dedupe_data_sz_cmp, &_chunksize);
/* The dedupe header itself is copied through unchanged. */
memcpy(tdat->uncompressed_chunk, cseg, RABIN_HDR_SIZE);
/*
 * Uncompress the data chunk first and then uncompress the index.
 * The uncompress routines can use extra bytes at the end for temporary
 * state/dictionary info. Since data chunk directly follows index
 * uncompressing index first corrupts the data.
 */
cmpbuf = cseg + RABIN_HDR_SIZE + dedupe_index_sz_cmp;
ubuf = tdat->uncompressed_chunk + RABIN_HDR_SIZE + dedupe_index_sz;
if (HDR & COMPRESSED) {
if (HDR & CHUNK_FLAG_PREPROC) {
rv = preproc_decompress(pctx, tdat->decompress, cmpbuf,
dedupe_data_sz_cmp, ubuf, &_chunksize, tdat->level,
HDR, pctx->btype, tdat->data, tdat->props);
} else {
DEBUG_STAT_EN(double strt, en);
DEBUG_STAT_EN(strt = get_wtime_millis());
rv = tdat->decompress(cmpbuf, dedupe_data_sz_cmp, ubuf, &_chunksize,
tdat->level, HDR, pctx->btype, tdat->data);
DEBUG_STAT_EN(en = get_wtime_millis());
DEBUG_STAT_EN(fprintf(stderr, "Chunk %d decompression speed %.3f MB/s\n",
tdat->id, get_mb_s(_chunksize, strt, en)));
}
if (rv == -1) {
tdat->len_cmp = 0;
log_msg(LOG_ERR, 0, "ERROR: Chunk %d, decompression failed.", tdat->id);
pctx->t_errored = 1;
goto cont;
}
} else {
memcpy(ubuf, cmpbuf, _chunksize);
}
rv = 0;
/* Now handle the index, which sits right after the dedupe header. */
cmpbuf = cseg + RABIN_HDR_SIZE;
ubuf = tdat->uncompressed_chunk + RABIN_HDR_SIZE;
if (dedupe_index_sz >= 90 && dedupe_index_sz > dedupe_index_sz_cmp) {
/*
 * Index should be at least 90 bytes to have been compressed.
 * NOTE(review): rv from lzma_decompress() is only examined after the
 * transpose below has already consumed ubuf.
 */
rv = lzma_decompress(cmpbuf, dedupe_index_sz_cmp, ubuf,
&dedupe_index_sz, tdat->rctx->level, 0, TYPE_BINARY, tdat->rctx->lzma_data);
} else {
memcpy(ubuf, cmpbuf, dedupe_index_sz);
}
/*
 * Recover from transposed index. cmpbuf is reused as scratch output and
 * the result is copied back into ubuf.
 */
transpose(ubuf, cmpbuf, dedupe_index_sz, sizeof (uint32_t), COL);
memcpy(ubuf, cmpbuf, dedupe_index_sz);
} else {
/* Chunk without dedupe data: decompress (or copy) the whole payload. */
if (HDR & COMPRESSED) {
if (HDR & CHUNK_FLAG_PREPROC) {
rv = preproc_decompress(pctx, tdat->decompress, cseg, tdat->len_cmp,
tdat->uncompressed_chunk, &_chunksize, tdat->level, HDR, pctx->btype,
tdat->data, tdat->props);
} else {
DEBUG_STAT_EN(double strt, en);
DEBUG_STAT_EN(strt = get_wtime_millis());
rv = tdat->decompress(cseg, tdat->len_cmp, tdat->uncompressed_chunk,
&_chunksize, tdat->level, HDR, pctx->btype, tdat->data);
DEBUG_STAT_EN(en = get_wtime_millis());
DEBUG_STAT_EN(fprintf(stderr, "Chunk decompression speed %.3f MB/s\n",
get_mb_s(_chunksize, strt, en)));
}
} else {
memcpy(tdat->uncompressed_chunk, cseg, _chunksize);
}
}
/* From here on len_cmp carries the decoded length handed to the writer. */
tdat->len_cmp = _chunksize;
if (rv == -1) {
tdat->len_cmp = 0;
log_msg(LOG_ERR, 0, "ERROR: Chunk %d, decompression failed.", tdat->id);
pctx->t_errored = 1;
goto cont;
}
/* Rebuild chunk from dedup blocks. */
if ((pctx->enable_rabin_scan || pctx->enable_fixed_scan) && (HDR & CHUNK_FLAG_DEDUP)) {
dedupe_context_t *rctx;
uchar_t *tmp;
rctx = tdat->rctx;
reset_dedupe_context(tdat->rctx);
/* Dedupe recovery writes its output into the compressed_chunk buffer. */
rctx->cbuf = tdat->compressed_chunk;
dedupe_decompress(rctx, tdat->uncompressed_chunk, &(tdat->len_cmp));
if (!rctx->valid) {
log_msg(LOG_ERR, 0, "ERROR: Chunk %d, dedup recovery failed.", tdat->id);
rv = -1;
tdat->len_cmp = 0;
pctx->t_errored = 1;
goto cont;
}
_chunksize = tdat->len_cmp;
/* Swap the two buffers so uncompressed_chunk refers to the rebuilt data. */
tmp = tdat->uncompressed_chunk;
tdat->uncompressed_chunk = tdat->compressed_chunk;
tdat->compressed_chunk = tmp;
tdat->cmp_seg = tdat->uncompressed_chunk;
} else {
/*
 * This chunk was not deduplicated, however we still need to down the
 * semaphore in order to maintain proper thread coordination. We do this after
 * decompression to achieve better concurrency. Decompression does not need
 * to wait for the previous thread's dedupe recovery to complete.
 */
if (pctx->enable_rabin_global) {
Sem_Wait(tdat->rctx->index_sem);
}
}
if (!pctx->encrypt_type) {
/*
 * Re-compute checksum of original uncompressed chunk.
 * If it does not match we set length of chunk to 0 to indicate
 * exit to the writer thread.
 */
compute_checksum(checksum, pctx->cksum, tdat->uncompressed_chunk,
_chunksize, tdat->cksum_mt, 1);
if (memcmp(checksum, tdat->checksum, pctx->cksum_bytes) != 0) {
tdat->len_cmp = 0;
log_msg(LOG_ERR, 0, "ERROR: Chunk %d, checksums do not match.", tdat->id);
pctx->t_errored = 1;
pctx->main_cancel = 1;
}
}
cont:
/* Signal that this chunk is done, then loop for the next one unless an error was flagged. */
Sem_Post(&tdat->cmp_done_sem);
if (!pctx->t_errored)
goto redo;
return (NULL);
}
/*
* File decompression routine.
*
* Compressed file Format
* ----------------------
* File Header:
* Algorithm string: 8 bytes.
* Version number: 2 bytes.
* Global Flags: 2 bytes.
* Chunk size: 8 bytes.
* Compression Level: 4 bytes.
*
* Chunk Header:
* Compressed length: 8 bytes.
* Checksum: Up to 64 bytes.
* Chunk flags: 1 byte.
*
* Chunk Flags, 8 bits:
* I I I I I I I I
* | | | | | |
* | '-----' | | `- 0 - Uncompressed
* | | | | 1 - Compressed
* | | | |
* | | | `---- 1 - Chunk was Deduped
* | | `------- 1 - Chunk was pre-compressed
* | |
* | | 1 - Bzip2 (Adaptive Mode)
* | `---------------- 2 - Lzma (Adaptive Mode)
* | 3 - PPMD (Adaptive Mode)
* |
* `---------------------- 1 - Chunk size flag (if original chunk is of variable length)
*
* A file trailer to indicate end.
* Zero Compressed length: 8 zero bytes.
*/
/*
 * Error exit helper for the decompression driver: flag the failure in the
 * local `err` variable and jump to the `uncomp_done` cleanup label. Both must
 * exist in the function that uses this macro.
 *
 * The do { } while (0) wrapper makes the two statements behave as a single
 * statement, so `if (cond) UNCOMP_BAIL;` only jumps when cond holds. Use it
 * with a trailing semicolon.
 */
#define UNCOMP_BAIL do { err = 1; goto uncomp_done; } while (0)
int DLL_EXPORT
start_decompress(pc_ctx_t *pctx, const char *filename, char *to_filename)
{
char algorithm[ALGO_SZ];
struct stat sbuf;
struct wdata w;
int compfd = -1, compfd2 = -1, p, dedupe_flag;
int uncompfd = -1, err, np, bail;
int thread = 0, level;
uint32_t nprocs = 1, i;
unsigned short version, flags;
int64_t chunksize, compressed_chunksize;
struct cmp_data **dary, *tdat;
pthread_t writer_thr;
algo_props_t props;
err = 0;
flags = 0;
thread = 0;
dary = NULL;
init_algo_props(&props);
/*
* Open files and do sanity checks.
*/
if (!pctx->pipe_mode) {
if (filename == NULL) {
pctx->pipe_mode = 1;
compfd = fileno(stdin);
if (compfd == -1) {
log_msg(LOG_ERR, 1, "fileno ");
UNCOMP_BAIL;
}
sbuf.st_size = 0;
} else {
if ((compfd = open(filename, O_RDONLY, 0)) == -1) {
log_msg(LOG_ERR, 1, "Cannot open: %s", filename);
return (1);
}
if (fstat(compfd, &sbuf) == -1) {
log_msg(LOG_ERR, 1, "Cannot stat: %s", filename);
return (1);
}
if (sbuf.st_size == 0)
return (1);
}
} else {
compfd = fileno(stdin);
if (compfd == -1) {
log_msg(LOG_ERR, 1, "fileno ");
UNCOMP_BAIL;
}
}
/*
* Read file header pieces and verify.
*/
if (Read(compfd, algorithm, ALGO_SZ) < ALGO_SZ) {
log_msg(LOG_ERR, 1, "Read: ");
UNCOMP_BAIL;
}
if (init_algo(pctx, algorithm, 0) != 0) {
if (pctx->pipe_mode || filename == NULL)
log_msg(LOG_ERR, 0, "Input stream is not pcompressed.");
else
log_msg(LOG_ERR, 0, "%s is not a pcompressed file.", filename);
UNCOMP_BAIL;
}
pctx->algo = algorithm;
if (Read(compfd, &version, sizeof (version)) < sizeof (version) ||
Read(compfd, &flags, sizeof (flags)) < sizeof (flags) ||
Read(compfd, &chunksize, sizeof (chunksize)) < sizeof (chunksize) ||
Read(compfd, &level, sizeof (level)) < sizeof (level)) {
log_msg(LOG_ERR, 1, "Read: ");
UNCOMP_BAIL;
}
version = ntohs(version);
flags = ntohs(flags);
chunksize = ntohll(chunksize);
level = ntohl(level);
/*
* Check for ridiculous values (malicious tampering or otherwise).
*/
if (version > VERSION) {
log_msg(LOG_ERR, 0, "Cannot handle newer archive version %d, capability %d",
version, VERSION);
err = 1;
goto uncomp_done;
}
if (chunksize > EIGHTY_PCT(get_total_ram())) {
log_msg(LOG_ERR, 0, "Chunk size must not exceed 80%% of total RAM.");
err = 1;
goto uncomp_done;
}
if (level > MAX_LEVEL || level < 0) {
log_msg(LOG_ERR, 0, "Invalid compression level in header: %d", level);
err = 1;
goto uncomp_done;
}
if (version < VERSION-4) {
log_msg(LOG_ERR, 0, "Unsupported version: %d", version);
err = 1;
goto uncomp_done;
}
/*
* First check for archive mode. In that case the to_filename must be a directory.
*/
if (flags & FLAG_ARCHIVE) {
if (flags & FLAG_META_STREAM && version > 9)
pctx->meta_stream = 1;
/*
* Archives with metadata streams cannot be decoded in pipe mode.
*/
if (pctx->pipe_mode && pctx->meta_stream) {
log_msg(LOG_ERR, 0,
"Cannot extract archive with metadata stream in pipe mode.");
}
/*
* If to_filename is not set, we just use the current directory.
*/
if (to_filename == NULL) {
to_filename = ".";
pctx->to_filename = ".";
}
pctx->archive_mode = 1;
if (stat(to_filename, &sbuf) == -1) {
if (errno != ENOENT) {
log_msg(LOG_ERR, 1, "Target path is not a directory.");
err = 1;
goto uncomp_done;
}
if (mkdir(to_filename,
S_IRUSR|S_IWUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH) == -1) {
log_msg(LOG_ERR, 1, "Unable to create target directory %s.",
to_filename);
err = 1;
goto uncomp_done;
}
if (stat(to_filename, &sbuf) == -1) {
log_msg(LOG_ERR, 1, "Unable to correctly create target directory %s.",
to_filename);
err = 1;
goto uncomp_done;
}
}
if (!S_ISDIR(sbuf.st_mode)) {
log_msg(LOG_ERR, 0, "Target path is not a directory.", to_filename);
err = 1;
goto uncomp_done;
}
/*
* Open another fd to the compressed archive. This is used by the metadata
* thread.
*/
if (pctx->meta_stream) {
if ((compfd2 = open(filename, O_RDONLY, 0)) == -1) {
log_msg(LOG_ERR, 1, "Cannot open: %s", filename);