Jsou tam sekvence znaku, ktery rozhodi line-based utility:
~/tool # nix --extra-experimental-features nix-command --extra-experimental-features flakes run . -- /tank/ct/24008/private/var/log/app/production.log --block-size 32MiB --merge-gap 64 --min-span 4
warning: creating lock file '"/root/tool/flake.lock"':
• Added input 'nixpkgs':
'github:NixOS/nixpkgs/b134951a4c9f3c995fd7be05f3243f8ecd65d798?narHash=sha256-OnSAY7XDSx7CtDoqNh8jwVwh4xNL/2HaJxGjryLWzX8%3D' (2024-12-30)
File size: 27,413,790,756 bytes
Found 22 suspicious span(s):
[1] start=10,703,050,310 end=10,703,050,458 length=148 bytes reasons=[ASCII-CTRL,INVALID-UTF8,NUL]
Inspect bytes: dd if=YOURFILE of=/dev/stdout bs=1 skip=10703050310 count=148 status=none | hexdump -C
Tail after span: tail -c +10703050459 YOURFILE | tail -n 200
[2] start=10,703,050,566 end=10,703,050,571 length=5 bytes reasons=[ASCII-CTRL,INVALID-UTF8,NUL]
Inspect bytes: dd if=YOURFILE of=/dev/stdout bs=1 skip=10703050566 count=5 status=none | hexdump -C
Tail after span: tail -c +10703050572 YOURFILE | tail -n 200
[3] start=10,703,050,698 end=10,703,050,703 length=5 bytes reasons=[ASCII-CTRL,INVALID-UTF8,NUL]
Inspect bytes: dd if=YOURFILE of=/dev/stdout bs=1 skip=10703050698 count=5 status=none | hexdump -C
Tail after span: tail -c +10703050704 YOURFILE | tail -n 200
[4] start=10,703,050,824 end=10,703,050,835 length=11 bytes reasons=[ASCII-CTRL,INVALID-UTF8,NUL]
Inspect bytes: dd if=YOURFILE of=/dev/stdout bs=1 skip=10703050824 count=11 status=none | hexdump -C
Tail after span: tail -c +10703050836 YOURFILE | tail -n 200
[5] start=10,703,050,903 end=10,703,050,909 length=6 bytes reasons=[ASCII-CTRL,INVALID-UTF8,NUL]
Inspect bytes: dd if=YOURFILE of=/dev/stdout bs=1 skip=10703050903 count=6 status=none | hexdump -C
Tail after span: tail -c +10703050910 YOURFILE | tail -n 200
[6] start=10,703,050,987 end=10,703,050,998 length=11 bytes reasons=[ASCII-CTRL,NUL]
Inspect bytes: dd if=YOURFILE of=/dev/stdout bs=1 skip=10703050987 count=11 status=none | hexdump -C
Tail after span: tail -c +10703050999 YOURFILE | tail -n 200
[7] start=10,703,051,067 end=10,703,051,073 length=6 bytes reasons=[ASCII-CTRL,INVALID-UTF8]
Inspect bytes: dd if=YOURFILE of=/dev/stdout bs=1 skip=10703051067 count=6 status=none | hexdump -C
Tail after span: tail -c +10703051074 YOURFILE | tail -n 200
[8] start=10,703,051,180 end=10,703,051,184 length=4 bytes reasons=[ASCII-CTRL,NUL]
Inspect bytes: dd if=YOURFILE of=/dev/stdout bs=1 skip=10703051180 count=4 status=none | hexdump -C
Tail after span: tail -c +10703051185 YOURFILE | tail -n 200
[9] start=10,703,051,441 end=10,703,051,521 length=80 bytes reasons=[ASCII-CTRL,INVALID-UTF8,NUL]
Inspect bytes: dd if=YOURFILE of=/dev/stdout bs=1 skip=10703051441 count=80 status=none | hexdump -C
Tail after span: tail -c +10703051522 YOURFILE | tail -n 200
[10] start=10,703,051,707 end=10,703,051,717 length=10 bytes reasons=[ASCII-CTRL,NUL]
Inspect bytes: dd if=YOURFILE of=/dev/stdout bs=1 skip=10703051707 count=10 status=none | hexdump -C
Tail after span: tail -c +10703051718 YOURFILE | tail -n 200
[11] start=10,703,051,852 end=10,703,051,996 length=144 bytes reasons=[ASCII-CTRL,NUL]
Inspect bytes: dd if=YOURFILE of=/dev/stdout bs=1 skip=10703051852 count=144 status=none | hexdump -C
Tail after span: tail -c +10703051997 YOURFILE | tail -n 200
[12] start=10,703,052,507 end=10,703,052,655 length=148 bytes reasons=[ASCII-CTRL,INVALID-UTF8,NUL]
Inspect bytes: dd if=YOURFILE of=/dev/stdout bs=1 skip=10703052507 count=148 status=none | hexdump -C
Tail after span: tail -c +10703052656 YOURFILE | tail -n 200
[13] start=10,703,052,763 end=10,703,052,768 length=5 bytes reasons=[ASCII-CTRL,INVALID-UTF8,NUL]
Inspect bytes: dd if=YOURFILE of=/dev/stdout bs=1 skip=10703052763 count=5 status=none | hexdump -C
Tail after span: tail -c +10703052769 YOURFILE | tail -n 200
[14] start=10,703,052,895 end=10,703,052,900 length=5 bytes reasons=[ASCII-CTRL,INVALID-UTF8,NUL]
Inspect bytes: dd if=YOURFILE of=/dev/stdout bs=1 skip=10703052895 count=5 status=none | hexdump -C
Tail after span: tail -c +10703052901 YOURFILE | tail -n 200
[15] start=10,703,053,021 end=10,703,053,032 length=11 bytes reasons=[ASCII-CTRL,INVALID-UTF8,NUL]
Inspect bytes: dd if=YOURFILE of=/dev/stdout bs=1 skip=10703053021 count=11 status=none | hexdump -C
Tail after span: tail -c +10703053033 YOURFILE | tail -n 200
[16] start=10,703,053,100 end=10,703,053,106 length=6 bytes reasons=[ASCII-CTRL,INVALID-UTF8,NUL]
Inspect bytes: dd if=YOURFILE of=/dev/stdout bs=1 skip=10703053100 count=6 status=none | hexdump -C
Tail after span: tail -c +10703053107 YOURFILE | tail -n 200
[17] start=10,703,053,184 end=10,703,053,195 length=11 bytes reasons=[ASCII-CTRL,NUL]
Inspect bytes: dd if=YOURFILE of=/dev/stdout bs=1 skip=10703053184 count=11 status=none | hexdump -C
Tail after span: tail -c +10703053196 YOURFILE | tail -n 200
[18] start=10,703,053,264 end=10,703,053,270 length=6 bytes reasons=[ASCII-CTRL,INVALID-UTF8]
Inspect bytes: dd if=YOURFILE of=/dev/stdout bs=1 skip=10703053264 count=6 status=none | hexdump -C
Tail after span: tail -c +10703053271 YOURFILE | tail -n 200
[19] start=10,703,053,377 end=10,703,053,381 length=4 bytes reasons=[ASCII-CTRL,NUL]
Inspect bytes: dd if=YOURFILE of=/dev/stdout bs=1 skip=10703053377 count=4 status=none | hexdump -C
Tail after span: tail -c +10703053382 YOURFILE | tail -n 200
[20] start=10,703,053,638 end=10,703,053,718 length=80 bytes reasons=[ASCII-CTRL,INVALID-UTF8,NUL]
Inspect bytes: dd if=YOURFILE of=/dev/stdout bs=1 skip=10703053638 count=80 status=none | hexdump -C
Tail after span: tail -c +10703053719 YOURFILE | tail -n 200
[21] start=10,703,053,904 end=10,703,053,914 length=10 bytes reasons=[ASCII-CTRL,NUL]
Inspect bytes: dd if=YOURFILE of=/dev/stdout bs=1 skip=10703053904 count=10 status=none | hexdump -C
Tail after span: tail -c +10703053915 YOURFILE | tail -n 200
[22] start=10,703,054,049 end=10,703,054,193 length=144 bytes reasons=[ASCII-CTRL,NUL]
Inspect bytes: dd if=YOURFILE of=/dev/stdout bs=1 skip=10703054049 count=144 status=none | hexdump -C
Tail after span: tail -c +10703054194 YOURFILE | tail -n 200
Approx bytes by reason:
ASCII-CTRL ~ 860 bytes
NUL ~ 848 bytes
INVALID-UTF8 ~ 522 bytes
Delal jsem to timhle (flake.nix, je v nem schovana python utilita, kterou napsalo GPT)
{
description = "find-bad-spans (single-file flake): detect binary/corrupt spans in huge logs with byte offsets";
# You can change this to nixos-unstable if you prefer.
inputs.nixpkgs.url = "github:NixOS/nixpkgs/nixos-24.05";
outputs = { self, nixpkgs }:
let
systems = [ "x86_64-linux" "aarch64-linux" ];
forAllSystems = f: nixpkgs.lib.genAttrs systems (system: f system);
# Embedded Python (no '' or b'' so it doesn't break Nix's indented string quoting)
pySource = ''
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
find_bad_spans.py — Stream a large text-ish file and report binary/corrupt spans with byte offsets.
Detects:
- NUL bytes
- ASCII control bytes (except TAB/LF/CR)
- Invalid UTF-8 sequences
It processes the file in chunks, validates UTF-8 using a bytes-regex for known valid
sequences, and reports the "holes" (i.e., anything not matching valid text).
It also carries incomplete UTF-8 sequences across chunk boundaries so they are not
misclassified.
Usage:
find-bad-spans /path/to/logfile \
--block-size 8MiB \
--merge-gap 0 \
--min-span 1 \
--report human
"""
import argparse
import os
import re
import sys
import json
# Regex for a sequence of valid text bytes:
# - ASCII printable + TAB/LF/CR
# - Valid UTF-8 (non-overlong, no surrogates, up to U+10FFFF)
VALID_TEXT_RE = re.compile(
rb'(?:'
rb'[\x09\x0A\x0D\x20-\x7E]' # ASCII (tab, lf, cr, and printable)
rb'|[\xC2-\xDF][\x80-\xBF]' # 2-byte
rb'|\xE0[\xA0-\xBF][\x80-\xBF]' # 3-byte, E0
rb'|[\xE1-\xEC][\x80-\xBF]{2}' # 3-byte, E1–EC
rb'|\xED[\x80-\x9F][\x80-\xBF]' # 3-byte, ED (no surrogates)
rb'|[\xEE-\xEF][\x80-\xBF]{2}' # 3-byte, EE–EF
rb'|\xF0[\x90-\xBF][\x80-\xBF]{2}' # 4-byte, F0
rb'|[\xF1-\xF3][\x80-\xBF]{3}' # 4-byte, F1–F3
rb'|\xF4[\x80-\x8F][\x80-\xBF]{2}' # 4-byte, F4 (<= U+10FFFF)
rb')+'
)
CTRL_RE = re.compile(rb'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]')
HIGHBIT_RE = re.compile(rb'[\x80-\xFF]')
def parse_size(s: str) -> int:
s = s.strip().lower().replace("ib", "").replace("b", "")
mult = 1
for suf, m in (("k", 1024), ("m", 1024**2), ("g", 1024**3)):
if s.endswith(suf):
mult = m
s = s[:-1]
break
return int(float(s) * mult)
def compute_carry_tail(buf: bytes) -> int:
"""
Return how many trailing bytes to carry over to the next chunk,
so we never split a UTF-8 sequence between chunks.
"""
n = len(buf)
if n == 0:
return 0
i = n - 1
cont = 0
# Count up to 3 trailing continuation bytes 0x80–0xBF
while cont < 3 and i >= 0 and 0x80 <= buf[i] <= 0xBF:
cont += 1
i -= 1
if i < 0:
# Entire chunk (or last few bytes) are continuations; carry a few to be safe.
return min(n, 3)
start = buf[i]
if 0xC2 <= start <= 0xDF:
need = 1
elif 0xE0 <= start <= 0xEF:
need = 2
elif 0xF0 <= start <= 0xF4:
need = 3
else:
return 0 # ASCII or invalid start doesn't need carry
return (cont + 1) if cont < need else 0
def find_invalid_segments(buf: bytes):
"""
Return list of (start, end) pairs (0-based relative to 'buf') for bytes
that do NOT match valid text per VALID_TEXT_RE.
"""
invalid = []
pos = 0
for m in VALID_TEXT_RE.finditer(buf):
s, e = m.span()
if s > pos:
invalid.append((pos, s))
pos = e
if pos < len(buf):
invalid.append((pos, len(buf)))
return invalid
def reasons_for(segment: bytes):
reasons = set()
if b'\x00' in segment:
reasons.add('NUL')
if CTRL_RE.search(segment):
reasons.add('ASCII-CTRL')
if HIGHBIT_RE.search(segment):
reasons.add('INVALID-UTF8')
return sorted(reasons)
def merge_or_append(spans, start, end, reasons, merge_gap):
"""
Merge with previous span if adjacent or within merge_gap.
"""
if spans and start - spans[-1]['end'] <= merge_gap:
# merge
spans[-1]['end'] = end
spans[-1]['length'] = spans[-1]['end'] - spans[-1]['start']
spans[-1]['reasons'].update(reasons)
else:
spans.append({
'start': start,
'end': end,
'length': end - start,
'reasons': set(reasons),
})
def scan_file(path, block_size, merge_gap, min_span):
spans = []
base = 0 # absolute offset for the beginning of current buffer
carry = b""
total_size = os.path.getsize(path)
with open(path, 'rb', buffering=0) as f:
while True:
chunk = f.read(block_size)
if not chunk:
if carry:
safe = carry # process whatever remains
for s, e in find_invalid_segments(safe):
if e - s >= min_span:
merge_or_append(spans, base + s, base + e, reasons_for(safe[s:e]), merge_gap)
base += len(safe)
carry = b""
break
buf = carry + chunk
carry_len = compute_carry_tail(buf)
safe_len = len(buf) - carry_len
safe = buf[:safe_len]
for s, e in find_invalid_segments(safe):
if e - s >= min_span:
merge_or_append(spans, base + s, base + e, reasons_for(safe[s:e]), merge_gap)
base += safe_len
carry = buf[safe_len:]
# Normalize reasons to lists
for sp in spans:
sp['reasons'] = sorted(sp['reasons'])
return spans, total_size
def print_human(spans, total_size):
if not spans:
print("No binary/corrupt spans found. (Entire file appears to be valid text/UTF-8.)")
return
print(f"File size: {total_size:,} bytes")
print(f"Found {len(spans)} suspicious span(s):\n")
for i, sp in enumerate(spans, 1):
start = sp['start']
end = sp['end']
length = sp['length']
reasons = ','.join(sp['reasons'])
# Example dd/tail helpers
dd_cmd = f"dd if=YOURFILE of=/dev/stdout bs=1 skip={start} count={min(length, 4096)} status=none | hexdump -C"
tail_cmd = f"tail -c +{end+1} YOURFILE | tail -n 200"
print(f"[{i}] start={start:,} end={end:,} length={length:,} bytes reasons=[{reasons}]")
print(f" Inspect bytes: {dd_cmd}")
print(f" Tail after span: {tail_cmd}")
# summary by reason
counts = {}
for sp in spans:
for r in sp['reasons']:
counts[r] = counts.get(r, 0) + sp['length']
if counts:
print("\nApprox bytes by reason:")
for r, n in sorted(counts.items(), key=lambda kv: (-kv[1], kv[0])):
print(f" {r:12s} ~ {n:,} bytes")
def main():
ap = argparse.ArgumentParser(description="Find binary/corrupt spans in a large log and report byte offsets.")
ap.add_argument("path", help="Path to the log file")
ap.add_argument("--block-size", default="8MiB", help="Read chunk size (e.g., 4MiB, 8MiB, 16MiB). Default: 8MiB")
ap.add_argument("--merge-gap", default="0", help="Merge spans separated by <= this many bytes (e.g., 64, 1KiB). Default: 0")
ap.add_argument("--min-span", default="1", help="Only report spans of at least this many bytes. Default: 1")
ap.add_argument("--report", choices=["human", "json", "tsv"], default="human", help="Output format")
args = ap.parse_args()
path = args.path
if not os.path.isfile(path):
print(f"error: not a file: {path}", file=sys.stderr)
sys.exit(2)
block_size = parse_size(args.block_size)
merge_gap = parse_size(args.merge_gap)
min_span = parse_size(args.min_span)
spans, total = scan_file(path, block_size, merge_gap, min_span)
if args.report == "human":
print_human(spans, total)
elif args.report == "json":
# Make reasons JSON-friendly
out = [{**sp, "reasons": sp["reasons"]} for sp in spans]
print(json.dumps({"file_size": total, "spans": out}, indent=2))
else: # tsv
print("start\tend\tlength\treasons")
for sp in spans:
print(f"{sp['start']}\t{sp['end']}\t{sp['length']}\t{','.join(sp['reasons'])}")
if __name__ == "__main__":
main()
'';
in
{
packages = forAllSystems (system:
let
pkgs = import nixpkgs { inherit system; };
pyFile = pkgs.writeText "find_bad_spans.py" pySource;
appPkg = pkgs.writeShellApplication {
name = "find-bad-spans";
runtimeInputs = [ pkgs.python3 ];
text = ''
exec ${pkgs.python3}/bin/python3 -u ${pyFile} "$@"
'';
};
in {
default = appPkg;
find-bad-spans = appPkg;
});
apps = forAllSystems (system: {
default = {
type = "app";
program = "${self.packages.${system}.default}/bin/find-bad-spans";
};
});
# Optional dev shell (brings python + coreutils/hexdump for manual poking)
devShells = forAllSystems (system:
let pkgs = import nixpkgs { inherit system; };
in {
default = pkgs.mkShell {
packages = [ pkgs.python3 pkgs.coreutils pkgs.util-linux ];
};
});
};
}
k cemuz jsem dospel takhle: https://chatgpt.com/share/69161a14-95a4-800b-b64d-3bee9259da5d
cili ZFS je pro jednou uplne bez viny 