GodotPythonJSONRPC/addons/kiripythonrpcwrapper/TARReader.gd

303 lines
8.5 KiB
GDScript3
Raw Normal View History

2024-07-14 08:54:08 -07:00
# TARReader
#
# Read .tar.zip files. Interface mostly identical to ZIPReader.
#
# Why .tar.zip instead of .tar.bz2, .tar.gz, .tar.zst, or something normal?
# Godot supports loading files with GZip and Zstandard compression, but only
# files that it's saved (with a header/footer), so it can't load normal .tar.gz
# or .tar.zst files. It can load zips, though.
#
# DO NOT USE THIS ON UNTRUSTED DATA.
extends RefCounted
class_name TARReader
#region Internal data
# Metadata for one entry parsed out of the .tar index by open().
class TarFileRecord:
	extends RefCounted
	# Entry path inside the archive (pax "path" override applied, if any).
	var filename : String
	# Byte offset of this entry's data within the cached .tar image.
	var offset : int
	# Size of this entry's data in bytes (before 512-byte padding).
	var file_size : int
	# Unix file permissions.
	#
	# Technically this is an int, but we're just going to leave it as an octal
	# string because that's what we can feed right into chmod.
	var mode : String
	# Symlinks.
	var is_link : bool
	var link_destination : String
	# True for directory entries (tar type indicator "5").
	var is_directory : bool
	# Raw one-character tar type indicator ("" when the header byte was NUL).
	var type_indicator : String
# Index of TarFileRecord entries, filled in by open().
var _internal_file_list = []
# Underlying zip archive; null when closed.
var _reader : ZIPReader = null
# Entire decompressed .tar held in memory; see the cache-wrangling region.
var _tar_file_cache : PackedByteArray = []
#endregion
#region Cache wrangling
# We have to load the entire .tar file into memory with the way the ZipReader
# API works, but we'll at least include an option to nuke the cache to free up
# memory if you want to just leave the file open.
#
# This lets us avoid re-opening and decompressing the entire .tar every time we
# need something out of it, while still letting us manually free the memory when
# we won't need it for a while.
func clear_cache():
	## Drop the in-memory copy of the decompressed .tar to free memory.
	## It will be transparently re-loaded by the next read.
	_tar_file_cache = PackedByteArray()
func load_cache() -> Error:
	## Decompress the inner .tar into memory if it isn't there already.
	## Returns ERR_FILE_UNRECOGNIZED unless the zip wraps exactly one file.
	assert(_reader)
	if not _tar_file_cache.is_empty():
		# Cache already in-memory.
		return OK
	var inner_names = _reader.get_files()
	if inner_names.size() != 1:
		# A .tar.zip must contain exactly one file: the .tar itself.
		return ERR_FILE_UNRECOGNIZED
	_tar_file_cache = _reader.read_file(inner_names[0])
	return OK
#endregion
func close() -> Error:
	## Close the archive and free the file index and cache.
	## Safe to call even if the archive was never opened, or was already
	## closed. Always returns OK.
	_internal_file_list = []
	# FIX: previously this dereferenced _reader unconditionally, so calling
	# close() before open() (or twice in a row) crashed on a null reference.
	if _reader:
		_reader.close()
		_reader = null
	clear_cache()
	return OK
func file_exists(path: String, case_sensitive: bool = true) -> bool:
	## Return true if the archive contains an entry named `path`.
	## Comparison is case-insensitive when `case_sensitive` is false.
	for entry : TarFileRecord in _internal_file_list:
		var matches : bool
		if case_sensitive:
			matches = entry.filename == path
		else:
			matches = entry.filename.nocasecmp_to(path) == 0
		if matches:
			return true
	return false
func get_files() -> PackedStringArray:
	## List the names of every entry in the archive, in on-disk order.
	var names : PackedStringArray = []
	for entry : TarFileRecord in _internal_file_list:
		names.push_back(entry.filename)
	return names
func _octal_str_to_int(s : String) -> int:
	## Parse a tar numeric field (octal ASCII) into an int.
	##
	## Tar numeric fields may be padded/terminated with spaces as well as
	## NULs (old GNU tar uses space termination). FIX: the previous
	## right-to-left parse treated a trailing space as a zero digit, which
	## silently multiplied the result by 8. We now strip padding and stop
	## at the first non-octal character, per the tar spec.
	var ret : int = 0
	for ch in s.strip_edges():
		if not ch in "01234567":
			break
		ret = ret * 8 + ch.to_int()
	return ret
func _pad_to_512(x : int) -> int:
	## Round `x` up to the next multiple of 512 (tar records are stored in
	## 512-byte blocks). Uses the standard power-of-two rounding identity.
	return (x + 511) & ~511
func open(path: String) -> Error:
	## Open the .tar.zip archive at `path`, decompress the inner .tar into
	## memory, and build the index of contained entries.
	##
	## Returns OK on success; an Error if the zip could not be opened or if
	## the inner .tar could not be extracted. Call close() before re-opening.
	assert(not _reader)
	_reader = ZIPReader.new()
	var err = _reader.open(path)
	if err != OK:
		_reader.close()
		_reader = null
		return err

	# FIX: the result of load_cache() was previously discarded, so a zip
	# that failed to decompress (or wrapped the wrong number of files)
	# silently produced an empty index and returned OK.
	err = load_cache()
	if err != OK:
		_reader.close()
		_reader = null
		return err

	var tar_file_offset = 0
	var zero_filled_record_count = 0
	var zero_filled_record : PackedByteArray = []
	zero_filled_record.resize(512)
	zero_filled_record.fill(0)

	# pax extended headers: "x" entries apply to the next file only,
	# "g" entries apply to every following file.
	var paxheader_next_file = {}
	var paxheader_global = {}

	while tar_file_offset < len(_tar_file_cache):
		var chunk = _tar_file_cache.slice(tar_file_offset, tar_file_offset + 512)

		# A tar archive ends with two consecutive all-zero 512-byte records.
		if chunk == zero_filled_record:
			zero_filled_record_count += 1
			if zero_filled_record_count >= 2:
				break
			tar_file_offset += 512
			continue

		var tar_record : TarFileRecord = TarFileRecord.new()

		# Fixed-offset header fields (POSIX ustar layout).
		var tar_chunk_name = chunk.slice(0, 100)
		var tar_chunk_size = chunk.slice(124, 124+12)
		var tar_chunk_mode = chunk.slice(100, 100+8)
		var tar_chunk_link_indicator = chunk.slice(156, 156+1)
		var tar_chunk_link_file = chunk.slice(157, 157+100)
		# FIXME: Technically "ustar\0" but we'll skip the \0
		var tar_ustar_indicator = chunk.slice(257, 257+5)
		var tar_ustar_file_prefix = chunk.slice(345, 345+155)

		# Pluck out the relevant bits we need for the record.
		tar_record.filename = tar_chunk_name.get_string_from_utf8()
		tar_record.file_size = _octal_str_to_int(tar_chunk_size.get_string_from_utf8())
		tar_record.mode = tar_chunk_mode.get_string_from_utf8()
		tar_record.is_link = (tar_chunk_link_indicator[0] != 0 and tar_chunk_link_indicator.get_string_from_utf8()[0] == "2")
		tar_record.link_destination = tar_chunk_link_file.get_string_from_utf8()
		tar_record.is_directory = (tar_chunk_link_indicator[0] != 0 and tar_chunk_link_indicator.get_string_from_utf8()[0] == "5")
		if tar_chunk_link_indicator[0] != 0:
			tar_record.type_indicator = tar_chunk_link_indicator.get_string_from_utf8()
		else:
			tar_record.type_indicator = ""

		# Append prefix if this is the "ustar" format.
		# TODO: Test this.
		if tar_ustar_indicator.get_string_from_utf8() == "ustar":
			tar_record.filename = \
				tar_ustar_file_prefix.get_string_from_utf8() + \
				tar_record.filename

		# TODO: Things we skipped:
		# - owner id (108, 108+8)
		# - group id (116, 116+8)
		# - modification time (136, 136+12)
		# - checksum (148, 148+8)
		# - mostly related to USTAR format

		# Skip header.
		tar_file_offset += 512
		# Record start offset.
		tar_record.offset = tar_file_offset
		# Skip file contents (data is padded out to a 512-byte boundary).
		tar_file_offset += _pad_to_512(tar_record.file_size)

		if tar_record.filename.get_file() == "@PaxHeader":
			# This is a special file entry that just has some extended data
			# about the next file or all the following files. It's not an
			# actual file.
			var paxheader_data : PackedByteArray = _tar_file_cache.slice(
				tar_record.offset,
				tar_record.offset + tar_record.file_size)
			var paxheader_str : String = paxheader_data.get_string_from_utf8()
			# Each pax line has the form "<length> <key>=<value>".
			var paxheader_lines = paxheader_str.split("\n", false)
			for line in paxheader_lines:
				# FIX: split only on the first space and the first "=" so
				# that values containing spaces or "=" (e.g. paths) survive,
				# and skip malformed lines instead of crashing on a missing
				# array index.
				var length_and_the_rest = line.split(" ", false, 1)
				if len(length_and_the_rest) < 2:
					continue
				var key_and_value = length_and_the_rest[1].split("=", true, 1)
				if len(key_and_value) < 2:
					continue
				var key = key_and_value[0]
				var value = key_and_value[1]
				if tar_record.type_indicator == "x":
					paxheader_next_file[key] = value
				elif tar_record.type_indicator == "g":
					paxheader_global[key] = value
		else:
			# Apply paxheader. We're just using "path" for now.
			# See here for other available fields:
			# https://pubs.opengroup.org/onlinepubs/009695399/utilities/pax.html
			var merged_paxheader : Dictionary = paxheader_global.duplicate()
			merged_paxheader.merge(paxheader_next_file, true)
			paxheader_next_file = {}
			if merged_paxheader.has("path"):
				tar_record.filename = merged_paxheader["path"]
				print("fixing path for paxheader: ", tar_record.filename)
			# Add it to our record list.
			_internal_file_list.append(tar_record)
	return OK
func _load_record(record : TarFileRecord) -> PackedByteArray:
	## Pull one entry's raw bytes out of the cached .tar image, re-loading
	## the cache first if it was cleared.
	load_cache()
	var start : int = record.offset
	return _tar_file_cache.slice(start, start + record.file_size)
func read_file(path : String, case_sensitive : bool = true) -> PackedByteArray:
	## Return the contents of the entry named `path`, or an empty
	## PackedByteArray if no such entry exists in the archive.
	for entry : TarFileRecord in _internal_file_list:
		var found : bool
		if case_sensitive:
			found = entry.filename == path
		else:
			found = entry.filename.nocasecmp_to(path) == 0
		if found:
			return _load_record(entry)
	return []
func unpack_file(dest_path : String, filename : String):
	## Extract the single entry named `filename` into `dest_path`, creating
	## intermediate directories as needed. Symlinks and file modes are
	## reproduced by shelling out to `ln`/`chmod`, so those paths are
	## POSIX-only (asserted below).
	var full_dest_path : String = dest_path.path_join(filename)
	DirAccess.make_dir_recursive_absolute(full_dest_path.get_base_dir())
	for record : TarFileRecord in _internal_file_list:
		# Refuse obviously malicious entry names: absolute paths, or paths
		# that escape upward out of the destination directory.
		if record.filename.is_absolute_path():
			assert(false)
			continue
		if record.filename.simplify_path().begins_with(".."):
			assert(false)
			continue
		# FIXME: There are probably a million other ways to do directory
		# traversal attacks.
		if record.filename == filename:
			if record.is_link:
				# Okay, look. I know that symbolic links technically exist on
				# Windows, but they're messy and hardly ever used. FIXME later
				# if for some reason you need to support that. -Kiri
				assert(OS.get_name() != "Windows")
				var err = OS.execute("ln", [
					"-s",
					record.link_destination,
					ProjectSettings.globalize_path(full_dest_path) ])
				assert(err != -1)
			elif record.is_directory:
				DirAccess.make_dir_recursive_absolute(full_dest_path)
			else:
				var file_data : PackedByteArray = read_file(record.filename)
				var out_file = FileAccess.open(full_dest_path, FileAccess.WRITE)
				# FIX: FileAccess.open() returns null on failure; previously
				# this crashed on store_buffer() instead of reporting the
				# error and skipping the entry.
				if out_file == null:
					push_error("TARReader: cannot open for writing: ",
						full_dest_path)
					continue
				out_file.store_buffer(file_data)
				out_file.close()
			# Set permissions (not for symlinks: link modes are meaningless,
			# and chmod would follow the link target).
			if not record.is_link:
				if OS.get_name() != "Windows":
					var err = OS.execute("chmod", [
						record.mode,
						ProjectSettings.globalize_path(full_dest_path) ])
					assert(err != -1)