From d76b164e452963ffd56d67c6ce24a538a9f60631 Mon Sep 17 00:00:00 2001 From: Caleb Gardner Date: Fri, 8 May 2026 06:06:33 -0500 Subject: [PATCH] More work on extraction, especially for regular files --- src/file.zig | 28 +++++- src/inode.zig | 36 +++++++ src/util/data_extractor.zig | 97 ++++++++++++++++++ src/util/data_reader.zig | 191 +++++++++++++++++++++++++++++++++++- src/util/decompressor.zig | 4 - src/util/metadata.zig | 10 +- src/util/offset_file.zig | 13 ++- src/util/shared_cache.zig | 52 ++++++++++ 8 files changed, 417 insertions(+), 14 deletions(-) create mode 100644 src/util/shared_cache.zig diff --git a/src/file.zig b/src/file.zig index b54dd87..bd5fa5b 100644 --- a/src/file.zig +++ b/src/file.zig @@ -8,7 +8,9 @@ const DirEntry = @import("directory.zig"); const ExtractionOptions = @import("options.zig"); const Inode = @import("inode.zig"); const DataExtractor = @import("util/data_extractor.zig"); +const Decompressor = @import("util/decompressor.zig"); const MetadataReader = @import("util/metadata.zig"); +const SharedCache = @import("util/shared_cache.zig"); const File = @This(); @@ -83,16 +85,36 @@ pub fn open(self: File, alloc: std.mem.Allocator, io: Io, filepath: []const u8) } pub fn extract(self: File, alloc: std.mem.Allocator, io: Io, filepath: []const u8, options: ExtractionOptions) !void { + var cache: SharedCache = try .init(alloc, 10); // TODO: calculate a good initial cache size. + defer cache.deinit(); + var decomp = switch (self.archive.super.compression) { + .gzip => {}, + .lzma => {}, + .xz => {}, + .zstd => {}, + else => unreachable, + }; + return self.extractReal(alloc, io, &cache, &decomp.interface, filepath, options); +} +fn extractReal(self: File, alloc: std.mem.Allocator, io: Io, cache: *SharedCache, decomp: *const Decompressor, filepath: []const u8, options: ExtractionOptions) !void { + _ = options; switch (self.inode.hdr.inode_type) { .file, .ext_file => { + var ext = try self.inode.dataExtractor( + self.archive.file, + cache, + decomp, + self.archive.super.block_size, + ); + var atomic_file = try Io.Dir.cwd().createFileAtomic(io, filepath, .{}); defer atomic_file.deinit(io); + + try ext.extract(alloc, io, atomic_file.file); + try atomic_file.link(io); }, else => return error.TODO, } - _ = alloc; - _ = options; - return error.TODO; } // Types diff --git a/src/inode.zig b/src/inode.zig index 3a79707..1976196 100644 --- a/src/inode.zig +++ b/src/inode.zig @@ -8,9 +8,12 @@ const DirEntry = @import("directory.zig"); const dir = @import("inode_data/dir.zig"); const file = @import("inode_data/file.zig"); const misc = @import("inode_data/misc.zig"); +const DataExtractor = @import("util/data_extractor.zig"); +const DataReader = @import("util/data_reader.zig"); const Decompressor = @import("util/decompressor.zig"); const MetadataReader = @import("util/metadata.zig"); const OffsetFile = @import("util/offset_file.zig"); +const SharedCache = @import("util/shared_cache.zig"); const Inode = @This(); @@ -52,6 +55,7 @@ pub fn deinit(self: Inode, alloc: std.mem.Allocator) void { // Utility Functions +/// Read the directory entries pub fn readDirectory(self: Inode, alloc: std.mem.Allocator, io: Io, fil: OffsetFile, decomp: *const Decompressor, dir_offset: u64) ![]DirEntry { return switch (self.data) { .dir => |d| readDirFromData(alloc, io, fil, decomp, dir_offset, d), @@ -66,6 +70,38 @@ fn readDirFromData(alloc: std.mem.Allocator, io: Io, fil: OffsetFile, decomp: *c return DirEntry.readDirectory(alloc, &meta.interface, d.size); } +/// Get a reader for a regular file's data. +pub fn dataReader(self: Inode, alloc: std.mem.Allocator, io: Io, fil: OffsetFile, cache: *SharedCache, decomp: *const Decompressor, block_size: u32) !DataReader { + return switch (self.data) { + .file => |f| getReaderFromData(alloc, io, fil, cache, decomp, block_size, f), + .ext_file => |f| getReaderFromData(alloc, io, fil, cache, decomp, block_size, f), + else => Error.NotRegularFile, + }; +} +fn getReaderFromData(alloc: std.mem.Allocator, io: Io, fil: OffsetFile, cache: *SharedCache, decomp: *const Decompressor, block_size: u32, d: anytype) !DataReader { + const ext: DataReader = .init(alloc, io, fil, cache, decomp, block_size, d.size, d.block_start, d.blocks); + if (d.frag_block_offset == 0xFFFFFFFF) { + // TODO: + return error.TODO; + } + return ext; +} +/// Get an extractor for a regular file's data. +pub fn dataExtractor(self: Inode, fil: OffsetFile, cache: *SharedCache, decomp: *const Decompressor, block_size: u32) !DataExtractor { + return switch (self.data) { + .file => |f| getExtractorFromData(fil, cache, decomp, block_size, f), + .ext_file => |f| getExtractorFromData(fil, cache, decomp, block_size, f), + else => Error.NotRegularFile, + }; +} +fn getExtractorFromData(fil: OffsetFile, cache: *SharedCache, decomp: *const Decompressor, block_size: u32, d: anytype) !DataExtractor { + const ext: DataExtractor = .init(fil, cache, decomp, block_size, d.size, d.block_start, d.blocks); + if (d.frag_block_offset == 0xFFFFFFFF) { + // TODO: + return error.TODO; + } + return ext; +} // Types diff --git a/src/util/data_extractor.zig b/src/util/data_extractor.zig index 51d2e9f..634474e 100644 --- a/src/util/data_extractor.zig +++ b/src/util/data_extractor.zig @@ -3,16 +3,113 @@ const std = @import("std"); const Io = std.Io; +const FragEntry = @import("../frag.zig").FragEntry; const BlockSize = @import("../inode_data/file.zig").BlockSize; const Decompressor = @import("decompressor.zig"); const OffsetFile = @import("offset_file.zig"); +const SharedCache = @import("shared_cache.zig"); const DataExtractor = @This(); fil: OffsetFile, +cache: *SharedCache, decomp: *const Decompressor, block_size: u32, file_size: u64, start: u64, blocks: []BlockSize, + +frag_offset: u32 = 0, +frag_entry: ?FragEntry = null, + +pub fn init(fil: OffsetFile, cache: *SharedCache, decomp: *const Decompressor, block_size: u32, file_size: u64, data_start: u64, blocks: []BlockSize) DataExtractor { + return .{ + .fil = fil, + .cache = cache, + .decomp = decomp, + .block_size = block_size, + + .file_size = file_size, + .start = data_start, + .blocks = blocks, + }; +} +pub fn addFrag(self: *DataExtractor, frag_offset: u32, entry: FragEntry) void { + self.frag_offset = frag_offset; + self.frag_entry = entry; +} + +fn numBlocks(self: DataExtractor) usize { + var num = self.blocks.len; + if (self.frag_entry != null) num += 1; + return num; +} + +pub fn extract(self: DataExtractor, alloc: std.mem.Allocator, io: Io, fil: Io.File) !void { + _ = self; + _ = alloc; + _ = io; + _ = fil; +} + +fn blockThread(self: DataExtractor, alloc: std.mem.Allocator, io: Io, fil: Io.File, read_offset: u64, offset: u64, idx: u32) !void { + const block = self.blocks[idx]; + + const cur_block_size = if (idx == self.numBlocks() - 1) + self.file_size % self.block_size + else + self.block_size; + + var wrt = fil.writer(io, &[0]u8{}); + try wrt.seekTo(offset); + defer wrt.flush() catch {}; + + if (block.size == 0) { + try wrt.interface.splatByteAll(0, cur_block_size); + return; + } + + var rdr = try self.fil.readerAt(io, read_offset, &[0]u8{}); + if (block.uncompressed) { + try rdr.interface.streamExact(&wrt, cur_block_size); + return; + } else { + @branchHint(.likely); + var cache = try self.cache.getCache(io); + defer self.cache.returnCache(cache); + + var tmp = try self.cache.getCache(io); + defer self.cache.returnCache(tmp); + + try rdr.interface.readSliceAll(cache.cache[0..block.size]); + _ = try self.decomp.Decompress(alloc, cache.cache[0..block.size], tmp.cache[0..cur_block_size]); + try wrt.interface.writeAll(tmp.cache[0..cur_block_size]); + } +} +fn fragThread(self: DataExtractor, alloc: std.mem.Allocator, io: Io, fil: Io.File, offset: u64) !void { + const frag = self.frag_entry.?; + const cur_block_size = self.file_size % self.block_size; + + var wrt = fil.writer(io, &[0]u8{}); + try wrt.seekTo(offset); + defer wrt.flush() catch {}; + + var rdr = try self.fil.readerAt(io, frag.start, &[0]u8{}); + if (frag.size.uncompressed) { + try rdr.interface.discardAll(self.frag_offset); + try rdr.interface.streamExact(&wrt, cur_block_size); + return; + } else { + @branchHint(.likely); + var cache = try self.cache.getCache(io); + defer self.cache.returnCache(cache); + + var tmp = try self.cache.getCache(io); + defer self.cache.returnCache(tmp); + + try rdr.interface.readSliceAll(cache.cache[0..frag.size.size]); + _ = try self.decomp.Decompress(alloc, cache.cache[0..frag.size.size], tmp.cache[0..self.block_size]); + try wrt.interface.writeAll(tmp.cache[0..cur_block_size]); + } +} diff --git a/src/util/data_reader.zig b/src/util/data_reader.zig index 15831f8..59d3adc 100644 --- a/src/util/data_reader.zig +++ b/src/util/data_reader.zig @@ -2,14 +2,23 @@ const std = @import("std"); const Io = std.Io; +const Reader = Io.Reader; +const Writer = Io.Writer; +const Limit = Io.Limit; +const FragEntry = @import("../frag.zig").FragEntry; const BlockSize = @import("../inode_data/file.zig").BlockSize; const Decompressor = @import("decompressor.zig"); const OffsetFile = @import("offset_file.zig"); +const SharedCache = @import("shared_cache.zig"); -const DataExtractor = @This(); +const DataReader = @This(); + +alloc: std.mem.Allocator, fil: OffsetFile, +io: Io, +cache: *SharedCache, decomp: *const Decompressor, block_size: u32, @@ -17,4 +26,184 @@ file_size: u64, cur_offset: u64, blocks: []BlockSize, +frag_offset: u32 = 0, +frag_entry: ?FragEntry = null, + +block_idx: usize = 0, +sparse_block: bool = false, + interface: Io.Reader, + +pub fn init(alloc: std.mem.Allocator, io: Io, fil: OffsetFile, cache: *SharedCache, decomp: *const Decompressor, block_size: u32, file_size: u64, data_start: u64, blocks: []BlockSize) !DataReader { + return .{ + .alloc = alloc, + + .fil = fil, + .io = io, + .decomp = decomp, + .block_size = block_size, + + .file_size = file_size, + .cur_offset = data_start, + .blocks = blocks, + + .interface = .{ + .buffer = try cache.getCache(io), + .seek = 0, + .end = 0, + .vtable = &.{ + .stream = stream, + .discard = discard, + .readVec = readVec, + }, + }, + }; +} +pub fn deinit(self: *DataReader) void { + if (self.interface.buffer.len > 0) { + const buf_nod: *SharedCache.BufferNode = @fieldParentPtr("cache", self.interface.buffer); + self.cache.returnCache(buf_nod); + } +} +pub fn addFrag(self: *DataReader, frag_offset: u32, entry: FragEntry) void { + self.frag_offset = frag_offset; + self.frag_entry = entry; +} + +fn numBlocks(self: DataReader) usize { + var num = self.blocks.len; + if (self.frag_entry != null) num += 1; + return num; +} +fn advanceBuffer(self: *DataReader) !void { + if (self.block_idx >= self.numBlocks()) { + return Reader.Error.EndOfStream; + } + defer self.block_idx += 1; + + self.interface.end = if (self.block_idx == self.numBlocks() - 1) + self.size % self.block_size + else + self.block_size; + + // Fragment + if (self.block_idx == self.blocks.len) { + const entry = self.frag_entry.?; + if (entry.size.uncompressed) { + var rdr = try self.fil.readerAt(self.io, entry.start + self.frag_offset, &[0]u8{}); + try rdr.interface.readSliceAll(self.interface.buffer[0..self.interface.end]); + } else { + @branchHint(.likely); + const tmp = try self.cache.getCache(self.io); + defer self.cache.returnCache(tmp); + + var rdr = try self.fil.readerAt(self.io, entry.start, &[0]u8{}); + try rdr.interface.readSliceAll(tmp.cache[0..entry.size.size]); + _ = try self.decomp.Decompress(self.alloc, tmp.cache[0..entry.size.size], self.interface.buffer[0..self.block_size]); + @memmove(self.interface.buffer[0..self.interface.end], self.interface.buffer[self.frag_offset .. self.frag_offset + self.interface.end]); + } + self.interface.seek = 0; + return; + } + + // Normal Block + const block = self.blocks[self.block_idx]; + if (block.size == 0) { + self.interface.seek = 0; + self.sparse_block = true; + return; + } else { + self.sparse_block = false; + } + if (block.uncompressed) { + try self.fil.readAt(self.io, self.cur_offset, self.interface.buffer[0..self.interface.end]); + self.cur_offset += self.interface.end; + } else { + @branchHint(.likely); + const tmp = try self.cache.getCache(self.io); + defer self.cache.returnCache(tmp); + + var rdr = try self.fil.readerAt(self.io, self.cur_offset, &[0]u8{}); + try rdr.interface.readSliceAll(tmp.cache[0..block.size]); + self.cur_offset += block.size; + _ = try self.decomp.Decompress(self.alloc, tmp.cache[0..block.size], self.interface.buffer[0..self.interface.end]); + } + self.interface.seek = 0; +} + +fn stream(rdr: *Reader, wrt: *Writer, limit: Limit) Reader.StreamError!usize { + var data: *DataReader = @fieldParentPtr("interface", rdr); + if (rdr.seek == rdr.end) + data.advanceBuffer() catch |err| return switch (err) { + error.ReadFailed => error.ReadFailed, + error.EndOfStream => error.EndOfStream, + else => error.ReadFailed, + }; + + switch (limit) { + .nothing => return 0, + .unlimited => { + const wrote = if (data.sparse_block) + try wrt.splatByte(0, rdr.end - rdr.seek) + else + try wrt.write(rdr.buffer[rdr.seek..rdr.end]); + rdr.seek += wrote; + return wrote; + }, + else => { + const to_read = @min(rdr.end - rdr.seek, @intFromEnum(limit)); + const wrote = if (data.sparse_block) + try wrt.splatByte(0, to_read) + else + try wrt.write(rdr.buffer[rdr.seek .. rdr.seek + to_read]); + rdr.seek += wrote; + return wrote; + }, + } +} +fn discard(rdr: *Reader, limit: Limit) Reader.Error!usize { + var data: *DataReader = @fieldParentPtr("interface", rdr); + if (rdr.seek == rdr.end) + data.advanceBuffer() catch |err| return switch (err) { + error.ReadFailed => error.ReadFailed, + error.EndOfStream => error.EndOfStream, + else => error.ReadFailed, + }; + + switch (limit) { + .nothing => return 0, + .unlimited => { + const adv = rdr.end - rdr.seek; + rdr.seek = rdr.end; + return adv; + }, + else => { + const adv = @min(rdr.end - rdr.seek, @intFromEnum(limit)); + rdr.seek += adv; + return adv; + }, + } +} +fn readVec(rdr: *Reader, vec: [][]u8) Reader.Error!usize { + var data: *DataReader = @fieldParentPtr("interface", rdr); + if (rdr.seek == rdr.end) + data.advanceBuffer() catch |err| return switch (err) { + error.ReadFailed => error.ReadFailed, + error.EndOfStream => error.EndOfStream, + else => error.ReadFailed, + }; + + var wrote: usize = 0; + for (vec) |buf| { + if (rdr.seek == rdr.end) break; + + const to_copy = @min(rdr.end - rdr.seek, buf.len); + if (data.sparse_block) + @memset(buf[0..to_copy], 0) + else + @memcpy(buf[0..to_copy], rdr.buffer[rdr.seek .. rdr.seek + to_copy]); + rdr.seek += to_copy; + wrote += to_copy; + } + return wrote; +} diff --git a/src/util/decompressor.zig b/src/util/decompressor.zig index ff4a491..3450513 100644 --- a/src/util/decompressor.zig +++ b/src/util/decompressor.zig @@ -13,7 +13,3 @@ decomp_fn: *const fn (?*const Decompressor, std.mem.Allocator, in: []u8, out: [] pub fn Decompress(self: *const Decompressor, alloc: std.mem.Allocator, in: []u8, out: []u8) Error!usize { return self.decomp_fn(self, alloc, in, out); } - -pub fn StatelessDecompression(self: Decompressor, alloc: std.mem.Allocator, in: []u8, out: []u8) Error!usize { - return self.decomp_fn(null, alloc, in, out); -} diff --git a/src/util/metadata.zig b/src/util/metadata.zig index 285127f..485c65c 100644 --- a/src/util/metadata.zig +++ b/src/util/metadata.zig @@ -49,11 +49,13 @@ fn advance(self: *This) !void { self.interface.end = hdr.size; self.interface.buffer = self.buf[0..hdr.size]; return; + } else { + @branchHint(.likely); + var tmp_buf: [8192]u8 = undefined; + try self.rdr.readSliceAll(tmp_buf[0..hdr.size]); + self.interface.end = try self.decomp.Decompress(self.alloc, tmp_buf[0..hdr.size], &self.buf); + self.interface.buffer = self.buf[0..self.interface.end]; } - var tmp_buf: [8192]u8 = undefined; - try self.rdr.readSliceAll(tmp_buf[0..hdr.size]); - self.interface.end = try self.decomp.Decompress(self.alloc, tmp_buf[0..hdr.size], &self.buf); - self.interface.buffer = self.buf[0..self.interface.end]; } fn stream(rdr: *Reader, wrt: *Writer, limit: Limit) StreamError!usize { diff --git a/src/util/offset_file.zig b/src/util/offset_file.zig index 2935a47..84c6588 100644 --- a/src/util/offset_file.zig +++ b/src/util/offset_file.zig @@ -1,7 +1,8 @@ //! A File where it's meaningful (to us) content starts at a given offset. const std = @import("std"); -const File = std.Io.File; +const Io = std.Io; +const File = Io.File; const Reader = File.Reader; const OffsetFile = @This(); @@ -13,8 +14,16 @@ pub fn init(fil: File, init_offset: u64) OffsetFile { return .{ .fil = fil, .offset = init_offset }; } -pub fn readerAt(self: OffsetFile, io: std.Io, offset: u64, buffer: []u8) !Reader { +pub fn readerAt(self: OffsetFile, io: Io, offset: u64, buffer: []u8) !Reader { var rdr = self.fil.reader(io, buffer); try rdr.seekTo(self.offset + offset); return rdr; } +pub fn readAt(self: OffsetFile, io: Io, offset: u64, buf: []u8) !void { + _ = try self.fil.readPositionalAll(io, buf, self.offset + offset); +} +pub fn readValueAt(self: OffsetFile, comptime T: anytype, io: Io, offset: u64) !void { + //TODO: check for endianess and decode accordingly. + var new: T = undefined; + _ = try self.fil.readPositionalAll(io, @ptrCast(&new), self.offset + offset); +} diff --git a/src/util/shared_cache.zig b/src/util/shared_cache.zig new file mode 100644 index 0000000..c081489 --- /dev/null +++ b/src/util/shared_cache.zig @@ -0,0 +1,52 @@ +const std = @import("std"); +const Io = std.Io; +const Node = std.SinglyLinkedList.Node; + +const SharedCache = @This(); + +pub const CACHE_SIZE = 1024 * 1024; + +pub const BufferNode = struct { + node: Node, + cache: [CACHE_SIZE]u8, +}; + +alloc: std.mem.Allocator, + +caches: std.ArrayList(BufferNode), +cache_queue: std.SinglyLinkedList, +queue_mut: Io.Mutex, + +pub fn init(alloc: std.mem.Allocator, init_cache_size: u32) !SharedCache { + const caches: std.ArrayList(BufferNode) = try .initCapacity(alloc, init_cache_size); + var queue: std.SinglyLinkedList = .{}; + for (caches.items) |item| + queue.prepend(&item.node); + return .{ + .alloc = alloc, + + .caches = caches, + .cache_queue = queue, + }; +} +pub fn deinit(self: *SharedCache) void { + self.caches.deinit(self.alloc); +} + +pub fn getCache(self: *SharedCache, io: Io) !*BufferNode { + self.queue_mut.lock(io); + const nxt = self.cache_queue.popFirst(); + self.queue_mut.unlock(io); + if (nxt == null) { + const new = try self.caches.addOne(self.alloc); + new.* = .{ + .node = .{}, + .cache = undefined, + }; + return new; + } + return @fieldParentPtr("node", nxt.?); +} +pub fn returnCache(self: *SharedCache, buf: *BufferNode) void { + self.cache_queue.prepend(buf); +}