More work on extraction, especially for regular files

This commit is contained in:
Caleb Gardner
2026-05-08 06:06:33 -05:00
parent 5521b2ce6a
commit d76b164e45
8 changed files with 417 additions and 14 deletions
+25 -3
View File
@@ -8,7 +8,9 @@ const DirEntry = @import("directory.zig");
const ExtractionOptions = @import("options.zig");
const Inode = @import("inode.zig");
const DataExtractor = @import("util/data_extractor.zig");
const Decompressor = @import("util/decompressor.zig");
const MetadataReader = @import("util/metadata.zig");
const SharedCache = @import("util/shared_cache.zig");
const File = @This();
@@ -83,16 +85,36 @@ pub fn open(self: File, alloc: std.mem.Allocator, io: Io, filepath: []const u8)
}
pub fn extract(self: File, alloc: std.mem.Allocator, io: Io, filepath: []const u8, options: ExtractionOptions) !void {
var cache: SharedCache = try .init(alloc, 10); // TODO: calculate a good initial cache size.
defer cache.deinit();
var decomp = switch (self.archive.super.compression) {
.gzip => {},
.lzma => {},
.xz => {},
.zstd => {},
else => unreachable,
};
return self.extractReal(alloc, io, &cache, &decomp.interface, filepath, options);
}
fn extractReal(self: File, alloc: std.mem.Allocator, io: Io, cache: *SharedCache, decomp: *const Decompressor, filepath: []const u8, options: ExtractionOptions) !void {
_ = options;
switch (self.inode.hdr.inode_type) {
.file, .ext_file => {
var ext = try self.inode.dataExtractor(
self.archive.file,
cache,
decomp,
self.archive.super.block_size,
);
var atomic_file = try Io.Dir.cwd().createFileAtomic(io, filepath, .{});
defer atomic_file.deinit(io);
try ext.extract(alloc, io, atomic_file.file);
try atomic_file.link(io);
},
else => return error.TODO,
}
_ = alloc;
_ = options;
return error.TODO;
}
// Types
+36
View File
@@ -8,9 +8,12 @@ const DirEntry = @import("directory.zig");
const dir = @import("inode_data/dir.zig");
const file = @import("inode_data/file.zig");
const misc = @import("inode_data/misc.zig");
const DataExtractor = @import("util/data_extractor.zig");
const DataReader = @import("util/data_reader.zig");
const Decompressor = @import("util/decompressor.zig");
const MetadataReader = @import("util/metadata.zig");
const OffsetFile = @import("util/offset_file.zig");
const SharedCache = @import("util/shared_cache.zig");
const Inode = @This();
@@ -52,6 +55,7 @@ pub fn deinit(self: Inode, alloc: std.mem.Allocator) void {
// Utility Functions
/// Read the directory entries
pub fn readDirectory(self: Inode, alloc: std.mem.Allocator, io: Io, fil: OffsetFile, decomp: *const Decompressor, dir_offset: u64) ![]DirEntry {
return switch (self.data) {
.dir => |d| readDirFromData(alloc, io, fil, decomp, dir_offset, d),
@@ -66,6 +70,38 @@ fn readDirFromData(alloc: std.mem.Allocator, io: Io, fil: OffsetFile, decomp: *c
return DirEntry.readDirectory(alloc, &meta.interface, d.size);
}
/// Get a reader for a regular file's data.
pub fn dataReader(self: Inode, alloc: std.mem.Allocator, io: Io, fil: OffsetFile, cache: *SharedCache, decomp: *const Decompressor, block_size: u32) !DataReader {
return switch (self.data) {
.file => |f| getReaderFromData(alloc, io, fil, cache, decomp, block_size, f),
.ext_file => |f| getReaderFromData(alloc, io, fil, cache, decomp, block_size, f),
else => Error.NotRegularFile,
};
}
fn getReaderFromData(alloc: std.mem.Allocator, io: Io, fil: OffsetFile, cache: *SharedCache, decomp: *const Decompressor, block_size: u32, d: anytype) !DataReader {
const ext: DataReader = .init(alloc, io, fil, cache, decomp, block_size, d.size, d.block_start, d.blocks);
if (d.frag_block_offset == 0xFFFFFFFF) {
// TODO:
return error.TODO;
}
return ext;
}
/// Get an extractor for a regular file's data.
pub fn dataExtractor(self: Inode, fil: OffsetFile, cache: *SharedCache, decomp: *const Decompressor, block_size: u32) !DataExtractor {
return switch (self.data) {
.file => |f| getExtractorFromData(fil, cache, decomp, block_size, f),
.ext_file => |f| getExtractorFromData(fil, cache, decomp, block_size, f),
else => Error.NotRegularFile,
};
}
fn getExtractorFromData(fil: OffsetFile, cache: *SharedCache, decomp: *const Decompressor, block_size: u32, d: anytype) !DataExtractor {
const ext: DataExtractor = .init(fil, cache, decomp, block_size, d.size, d.block_start, d.blocks);
if (d.frag_block_offset == 0xFFFFFFFF) {
// TODO:
return error.TODO;
}
return ext;
}
// Types
+97
View File
@@ -3,16 +3,113 @@
const std = @import("std");
const Io = std.Io;
const FragEntry = @import("../frag.zig").FragEntry;
const BlockSize = @import("../inode_data/file.zig").BlockSize;
const Decompressor = @import("decompressor.zig");
const OffsetFile = @import("offset_file.zig");
const SharedCache = @import("shared_cache.zig");
const DataExtractor = @This();
fil: OffsetFile,
cache: *SharedCache,
decomp: *const Decompressor,
block_size: u32,
file_size: u64,
start: u64,
blocks: []BlockSize,
frag_offset: u32 = 0,
frag_entry: ?FragEntry = null,
pub fn init(fil: OffsetFile, cache: *SharedCache, decomp: *const Decompressor, block_size: u32, file_size: u64, data_start: u64, blocks: []BlockSize) DataExtractor {
return .{
.fil = fil,
.cache = cache,
.decomp = decomp,
.block_size = block_size,
.file_size = file_size,
.start = data_start,
.blocks = blocks,
};
}
pub fn addFrag(self: *DataExtractor, frag_offset: u32, entry: FragEntry) void {
self.frag_offset = frag_offset;
self.frag_entry = entry;
}
fn numBlocks(self: DataExtractor) usize {
var num = self.blocks.len;
if (self.frag_entry != null) num += 1;
return num;
}
pub fn extract(self: DataExtractor, alloc: std.mem.Allocator, io: Io, fil: Io.File) !void {
_ = self;
_ = alloc;
_ = io;
_ = fil;
}
fn blockThread(self: DataExtractor, alloc: std.mem.Allocator, io: Io, fil: Io.File, read_offset: u64, offset: u64, idx: u32) !void {
const block = self.blocks[idx];
const cur_block_size = if (idx == self.numBlocks() - 1)
self.file_size % self.block_size
else
self.block_size;
var wrt = fil.writer(io, &[0]u8{});
try wrt.seekTo(offset);
defer wrt.flush() catch {};
if (block.size == 0) {
try wrt.interface.splatByteAll(0, cur_block_size);
return;
}
var rdr = try self.fil.readerAt(io, read_offset, &[0]u8{});
if (block.uncompressed) {
try rdr.interface.streamExact(&wrt, cur_block_size);
return;
} else {
@branchHint(.likely);
var cache = try self.cache.getCache(io);
defer self.cache.returnCache(cache);
var tmp = try self.cache.getCache(io);
defer self.cache.returnCache(tmp);
try rdr.interface.readSliceAll(cache.cache[0..block.size]);
_ = try self.decomp.Decompress(alloc, cache.cache[0..block.size], tmp.cache[0..cur_block_size]);
try wrt.interface.writeAll(tmp.cache[0..cur_block_size]);
}
}
fn fragThread(self: DataExtractor, alloc: std.mem.Allocator, io: Io, fil: Io.File, offset: u64) !void {
const frag = self.frag_entry.?;
const cur_block_size = self.file_size % self.block_size;
var wrt = fil.writer(io, &[0]u8{});
try wrt.seekTo(offset);
defer wrt.flush() catch {};
var rdr = try self.fil.readerAt(io, frag.start, &[0]u8{});
if (frag.size.uncompressed) {
try rdr.interface.discardAll(self.frag_offset);
try rdr.interface.streamExact(&wrt, cur_block_size);
return;
} else {
@branchHint(.likely);
var cache = try self.cache.getCache(io);
defer self.cache.returnCache(cache);
var tmp = try self.cache.getCache(io);
defer self.cache.returnCache(tmp);
try rdr.interface.readSliceAll(cache.cache[0..frag.size.size]);
_ = try self.decomp.Decompress(alloc, cache.cache[0..frag.size.size], tmp.cache[0..self.block_size]);
try wrt.interface.writeAll(tmp.cache[0..cur_block_size]);
}
}
+190 -1
View File
@@ -2,14 +2,23 @@
const std = @import("std");
const Io = std.Io;
const Reader = Io.Reader;
const Writer = Io.Writer;
const Limit = Io.Limit;
const FragEntry = @import("../frag.zig").FragEntry;
const BlockSize = @import("../inode_data/file.zig").BlockSize;
const Decompressor = @import("decompressor.zig");
const OffsetFile = @import("offset_file.zig");
const SharedCache = @import("shared_cache.zig");
const DataExtractor = @This();
const DataReader = @This();
alloc: std.mem.Allocator,
fil: OffsetFile,
io: Io,
cache: *SharedCache,
decomp: *const Decompressor,
block_size: u32,
@@ -17,4 +26,184 @@ file_size: u64,
cur_offset: u64,
blocks: []BlockSize,
frag_offset: u32 = 0,
frag_entry: ?FragEntry = null,
block_idx: usize = 0,
sparse_block: bool = false,
interface: Io.Reader,
pub fn init(alloc: std.mem.Allocator, io: Io, fil: OffsetFile, cache: *SharedCache, decomp: *const Decompressor, block_size: u32, file_size: u64, data_start: u64, blocks: []BlockSize) !DataReader {
return .{
.alloc = alloc,
.fil = fil,
.io = io,
.decomp = decomp,
.block_size = block_size,
.file_size = file_size,
.cur_offset = data_start,
.blocks = blocks,
.interface = .{
.buffer = try cache.getCache(io),
.seek = 0,
.end = 0,
.vtable = &.{
.stream = stream,
.discard = discard,
.readVec = readVec,
},
},
};
}
pub fn deinit(self: *DataReader) void {
if (self.interface.buffer.len > 0) {
const buf_nod: *SharedCache.BufferNode = @fieldParentPtr("cache", self.interface.buffer);
self.cache.returnCache(buf_nod);
}
}
pub fn addFrag(self: *DataReader, frag_offset: u32, entry: FragEntry) void {
self.frag_offset = frag_offset;
self.frag_entry = entry;
}
fn numBlocks(self: DataReader) usize {
var num = self.blocks.len;
if (self.frag_entry != null) num += 1;
return num;
}
fn advanceBuffer(self: *DataReader) !void {
if (self.block_idx >= self.numBlocks()) {
return Reader.Error.EndOfStream;
}
defer self.block_idx += 1;
self.interface.end = if (self.block_idx == self.numBlocks() - 1)
self.size % self.block_size
else
self.block_size;
// Fragment
if (self.block_idx == self.blocks.len) {
const entry = self.frag_entry.?;
if (entry.size.uncompressed) {
var rdr = try self.fil.readerAt(self.io, entry.start + self.frag_offset, &[0]u8{});
try rdr.interface.readSliceAll(self.interface.buffer[0..self.interface.end]);
} else {
@branchHint(.likely);
const tmp = try self.cache.getCache(self.io);
defer self.cache.returnCache(tmp);
var rdr = try self.fil.readerAt(self.io, entry.start, &[0]u8{});
try rdr.interface.readSliceAll(tmp.cache[0..entry.size.size]);
_ = try self.decomp.Decompress(self.alloc, tmp.cache[0..entry.size.size], self.interface.buffer[0..self.block_size]);
@memmove(self.interface.buffer[0..self.interface.end], self.interface.buffer[self.frag_offset .. self.frag_offset + self.interface.end]);
}
self.interface.seek = 0;
return;
}
// Normal Block
const block = self.blocks[self.block_idx];
if (block.size == 0) {
self.interface.seek = 0;
self.sparse_block = true;
return;
} else {
self.sparse_block = false;
}
if (block.uncompressed) {
try self.fil.readAt(self.io, self.cur_offset, self.interface.buffer[0..self.interface.end]);
self.cur_offset += self.interface.end;
} else {
@branchHint(.likely);
const tmp = try self.cache.getCache(self.io);
defer self.cache.returnCache(tmp);
var rdr = try self.fil.readerAt(self.io, self.cur_offset, &[0]u8{});
try rdr.interface.readSliceAll(tmp.cache[0..block.size]);
self.cur_offset += block.size;
_ = try self.decomp.Decompress(self.alloc, tmp.cache[0..block.size], self.interface.buffer[0..self.interface.end]);
}
self.interface.seek = 0;
}
fn stream(rdr: *Reader, wrt: *Writer, limit: Limit) Reader.StreamError!usize {
var data: *DataReader = @fieldParentPtr("interface", rdr);
if (rdr.seek == rdr.end)
data.advanceBuffer() catch |err| return switch (err) {
error.ReadFailed => error.ReadFailed,
error.EndOfStream => error.EndOfStream,
else => error.ReadFailed,
};
switch (limit) {
.nothing => return 0,
.unlimited => {
const wrote = if (data.sparse_block)
try wrt.splatByte(0, rdr.end - rdr.seek)
else
try wrt.write(rdr.buffer[rdr.seek..rdr.end]);
rdr.seek += wrote;
return wrote;
},
else => {
const to_read = @min(rdr.end - rdr.seek, @intFromEnum(limit));
const wrote = if (data.sparse_block)
try wrt.splatByte(0, to_read)
else
try wrt.write(rdr.buffer[rdr.seek .. rdr.seek + to_read]);
rdr.seek += wrote;
return wrote;
},
}
}
fn discard(rdr: *Reader, limit: Limit) Reader.Error!usize {
var data: *DataReader = @fieldParentPtr("interface", rdr);
if (rdr.seek == rdr.end)
data.advanceBuffer() catch |err| return switch (err) {
error.ReadFailed => error.ReadFailed,
error.EndOfStream => error.EndOfStream,
else => error.ReadFailed,
};
switch (limit) {
.nothing => return 0,
.unlimited => {
const adv = rdr.end - rdr.seek;
rdr.seek = rdr.end;
return adv;
},
else => {
const adv = @min(rdr.end - rdr.seek, @intFromEnum(limit));
rdr.seek += adv;
return adv;
},
}
}
fn readVec(rdr: *Reader, vec: [][]u8) Reader.Error!usize {
var data: *DataReader = @fieldParentPtr("interface", rdr);
if (rdr.seek == rdr.end)
data.advanceBuffer() catch |err| return switch (err) {
error.ReadFailed => error.ReadFailed,
error.EndOfStream => error.EndOfStream,
else => error.ReadFailed,
};
var wrote: usize = 0;
for (vec) |buf| {
if (rdr.seek == rdr.end) break;
const to_copy = @min(rdr.end - rdr.seek, buf.len);
if (data.sparse_block)
@memset(buf[0..to_copy], 0)
else
@memcpy(buf[0..to_copy], rdr.buffer[rdr.seek .. rdr.seek + to_copy]);
rdr.seek += to_copy;
wrote += to_copy;
}
return wrote;
}
-4
View File
@@ -13,7 +13,3 @@ decomp_fn: *const fn (?*const Decompressor, std.mem.Allocator, in: []u8, out: []
pub fn Decompress(self: *const Decompressor, alloc: std.mem.Allocator, in: []u8, out: []u8) Error!usize {
return self.decomp_fn(self, alloc, in, out);
}
pub fn StatelessDecompression(self: Decompressor, alloc: std.mem.Allocator, in: []u8, out: []u8) Error!usize {
return self.decomp_fn(null, alloc, in, out);
}
+3 -1
View File
@@ -49,12 +49,14 @@ fn advance(self: *This) !void {
self.interface.end = hdr.size;
self.interface.buffer = self.buf[0..hdr.size];
return;
}
} else {
@branchHint(.likely);
var tmp_buf: [8192]u8 = undefined;
try self.rdr.readSliceAll(tmp_buf[0..hdr.size]);
self.interface.end = try self.decomp.Decompress(self.alloc, tmp_buf[0..hdr.size], &self.buf);
self.interface.buffer = self.buf[0..self.interface.end];
}
}
fn stream(rdr: *Reader, wrt: *Writer, limit: Limit) StreamError!usize {
const self: *This = @fieldParentPtr("interface", rdr);
+11 -2
View File
@@ -1,7 +1,8 @@
//! A File where it's meaningful (to us) content starts at a given offset.
const std = @import("std");
const File = std.Io.File;
const Io = std.Io;
const File = Io.File;
const Reader = File.Reader;
const OffsetFile = @This();
@@ -13,8 +14,16 @@ pub fn init(fil: File, init_offset: u64) OffsetFile {
return .{ .fil = fil, .offset = init_offset };
}
pub fn readerAt(self: OffsetFile, io: std.Io, offset: u64, buffer: []u8) !Reader {
pub fn readerAt(self: OffsetFile, io: Io, offset: u64, buffer: []u8) !Reader {
var rdr = self.fil.reader(io, buffer);
try rdr.seekTo(self.offset + offset);
return rdr;
}
pub fn readAt(self: OffsetFile, io: Io, offset: u64, buf: []u8) !void {
_ = try self.fil.readPositionalAll(io, buf, self.offset + offset);
}
pub fn readValueAt(self: OffsetFile, comptime T: anytype, io: Io, offset: u64) !void {
//TODO: check for endianess and decode accordingly.
var new: T = undefined;
_ = try self.fil.readPositionalAll(io, @ptrCast(&new), self.offset + offset);
}
+52
View File
@@ -0,0 +1,52 @@
const std = @import("std");
const Io = std.Io;
const Node = std.SinglyLinkedList.Node;
const SharedCache = @This();
pub const CACHE_SIZE = 1024 * 1024;
pub const BufferNode = struct {
node: Node,
cache: [CACHE_SIZE]u8,
};
alloc: std.mem.Allocator,
caches: std.ArrayList(BufferNode),
cache_queue: std.SinglyLinkedList,
queue_mut: Io.Mutex,
pub fn init(alloc: std.mem.Allocator, init_cache_size: u32) !SharedCache {
const caches: std.ArrayList(BufferNode) = try .initCapacity(alloc, init_cache_size);
var queue: std.SinglyLinkedList = .{};
for (caches.items) |item|
queue.prepend(&item.node);
return .{
.alloc = alloc,
.caches = caches,
.cache_queue = queue,
};
}
pub fn deinit(self: *SharedCache) void {
self.caches.deinit(self.alloc);
}
pub fn getCache(self: *SharedCache, io: Io) !*BufferNode {
self.queue_mut.lock(io);
const nxt = self.cache_queue.popFirst();
self.queue_mut.unlock(io);
if (nxt == null) {
const new = try self.caches.addOne(self.alloc);
new.* = .{
.node = .{},
.cache = undefined,
};
return new;
}
return @fieldParentPtr("node", nxt.?);
}
pub fn returnCache(self: *SharedCache, buf: *BufferNode) void {
self.cache_queue.prepend(buf);
}