# Before packing
md5sum cp_25112024.txt >> manifest.txt
md5sum -c manifest.txt
echo "Step 2: Split into 500MB parts"
split -b 500M $OUTPUT_DIR/cleaned.txt $OUTPUT_DIR/part_

packs cp 25112024 txt better
zstd -15 cp_25112024.txt -o cp_25112024.txt.zst

A .txt dump from November 25, 2024 is useless without structure.

3.1. Convert to SQLite (Single file, queryable)

    sqlite3 data.db
    CREATE TABLE logs (line TEXT);
    .import cp_25112024.txt logs
    CREATE INDEX idx_line ON logs(line(100));

Now you can SELECT instead of grep.

3.2. Use ripgrep (rg) instead of grep – 50x faster

    rg --stats 'pattern' cp_25112024.txt

3.3. Deduplicate lines – shrink packs instantly

    sort cp_25112024.txt | uniq > cp_25112024_dedup.txt

For huge files, use sort -S 2G to limit memory.

4. Automating the “Better” Pipeline (Example Script)

Save as better_pack.sh – run on cp_25112024.txt:

# Before packing
md5sum cp_25112024
echo "Step 4: Generate checksum"
sha256sum $OUTPUT_DIR/*.zst > $OUTPUT_DIR/checksums.txt
# Before packing
md5sum cp_25112024.txt >
| Format | Speed | Ratio | Best for |
|--------|-------|-------|-----------|
| Gzip | Fast | Good | Logs, sequential read |
| Zstd | Very fast | Excellent | Real-time pipelines |
| LZ4 | Instant | Medium | Near-RAM speeds |
| Brotli | Slow | Best | Archival (once) |