diff options
Diffstat (limited to 'unpk.py')
-rwxr-xr-x | unpk.py | 519 |
1 files changed, 519 insertions, 0 deletions
import array
import itertools
import os
import struct
import sys

# class PKStream is based on information from Ben Rudiak-Gould:
# http://groups.google.com/group/comp.compression/msg/48ea9de6d71a575b
# and implementation of Douglas Kane:
# http://groups.google.com/group/comp.compression/msg/aa014556d706c525
# Archive extraction code was highly influenced by Gavin Clayton's
# Daggerfall Jukebox:
# http://www.dfworkshop.net/?page_id=61


def file_bytestream(f):
    """Yield the content of binary file-like object *f* as ints in 0..255.

    The file is deliberately read one byte at a time: unpack_file() decodes
    several consecutive streams from the same handle and relies on the file
    position sitting right behind the last byte the decoder consumed.
    """
    while True:
        chunk = f.read(1)
        if not chunk:
            # A plain return ends the generator; raising StopIteration here
            # would be turned into RuntimeError on Python 3.7+ (PEP 479).
            return
        yield struct.unpack('B', chunk)[0]


class PKStream(object):
    """Decoder for one PKWARE DCL ("implode") compressed stream.

    The stream starts with two header bytes (literal mode, dictionary size
    exponent) followed by a bit stream that is consumed least significant
    bit first.  Call decode() to obtain the decompressed bytes.
    """

    def __init__(self, source):
        """Read the two header bytes from *source*.

        source -- a binary file-like object (anything with ``read``), an
                  iterator yielding ints in 0..255, or an iterable of such
                  ints.

        Raises TypeError for any other kind of source and ValueError when
        the header bytes are not valid.
        """
        if hasattr(source, "read"):
            self.bytestream = file_bytestream(source)
        elif hasattr(source, "__next__") or hasattr(source, "next"):
            self.bytestream = source
        elif hasattr(source, "__iter__"):
            self.bytestream = iter(source)
        else:
            raise TypeError(
                "Expected file or iterable object, got %s" % type(source))
        # Number of not yet consumed bits left in self.last_byte.
        self.bits_read = 0
        self.prefixed_literals = self.read(8)
        if self.prefixed_literals not in (0, 1):
            raise ValueError(
                "Invalid literal mode byte: %s" % self.prefixed_literals)
        self.dict_bytes = self.read(8)
        if self.dict_bytes not in (4, 5, 6):
            raise ValueError(
                "Invalid dictionary size byte: %s" % self.dict_bytes)
        self.dict_size = 2 ** (self.dict_bytes + 6)
        # Ring buffer holding the most recent dict_size output bytes.
        self.dictionary = array.array(
            'B', itertools.repeat(0, self.dict_size))
        # Index in self.dictionary where the next output byte is stored.
        self.current_key = 0

    def read_byte(self):
        """Fetch the next input byte; StopIteration signals end of input."""
        self.bits_read = 8
        self.last_byte = next(self.bytestream)

    def read(self, n):
        """Return the next *n* bits as an int, first bit read = bit 0."""
        value = 0
        collected = 0
        while collected < n:
            if self.bits_read == 0:
                # Only touch the input when more bits are really needed.
                self.read_byte()
            take = min(n - collected, self.bits_read)
            chunk = (self.last_byte >> (8 - self.bits_read)) \
                & ((1 << take) - 1)
            value |= chunk << collected
            self.bits_read -= take
            collected += take
        return value

    def read_rev(self, bits):
        """Return the next *bits* bits with the first bit read as the MSB."""
        value = self.read(bits)
        temp = 0
        for _ in range(bits):
            temp = (temp << 1) | (value & 0x0001)
            value >>= 1
        return temp

    def decode_literal(self):
        """Return the next literal byte value.

        With prefixed (coded) literals the byte is Huffman coded; the
        comments show the code bits in the order they appear in the stream.
        Otherwise the literal is stored as 8 plain bits.
        """
        if self.prefixed_literals:
            temp = self.read_rev(4)
            if temp == 0xf:  # 1111
                return 0x20
            if temp == 0xe:  # 1110
                if self.read(1):  # 11101
                    return 0x45
                # 11100
                return 0x61
            if temp == 0xd:  # 1101
                if self.read(1):  # 11011
                    return 0x65
                # 11010
                return 0x69
            if temp == 0xc:  # 1100
                if self.read(1):  # 11001
                    return 0x6c
                # 11000
                return 0x6e
            if temp == 0xb:  # 1011
                if self.read(1):  # 10111
                    return 0x6f
                # 10110
                return 0x72
            if temp == 0xa:  # 1010x
                return 0x74 - self.read(1)
            if temp == 0x9:  # 1001
                if self.read(1):  # 10011
                    return 0x75
                if self.read(1):  # 100101
                    return 0x2d
                # 100100
                return 0x31
            if temp == 0x8:  # 1000
                temp = self.read_rev(2)
                if temp == 0x3:  # 100011
                    return 0x41
                if temp == 0x2:  # 100010
                    return 0x43
                if temp == 0x1:  # 100001
                    return 0x44
                # 100000
                return 0x49
            if temp == 0x7:  # 0111
                temp = self.read_rev(2)
                if temp == 0x3:  # 011111
                    return 0x4c
                if temp == 0x2:  # 011110
                    return 0x4e
                if temp == 0x1:  # 011101
                    return 0x4f
                # 011100
                return 0x52
            if temp == 0x6:  # 0110
                if self.read(1):  # 01101x
                    return 0x54 - self.read(1)
                # 01100x
                return 0x63 - self.read(1)
            if temp == 0x5:  # 0101
                temp = self.read_rev(2)
                if temp == 0x3:  # 010111
                    return 0x64
                # 0101xx
                return 0x68 - temp
            if temp == 0x4:  # 0100
                if self.read(1):  # 01001
                    if self.read(1):  # 010011
                        return 0x6d
                    # 010010
                    return 0x70
                # 01000
                if self.read(1):  # 010001
                    if self.read(1):  # 0100011
                        return 0x0a
                    # 0100010
                    return 0x0d
                # 010000x
                return 0x29 - self.read(1)
            if temp == 0x3:  # 0011
                temp = self.read_rev(3)
                if temp == 0x7:  # 0011111
                    return 0x2c
                if temp == 0x6:  # 0011110
                    return 0x2e
                if temp == 0x5:  # 0011101
                    return 0x30
                if temp == 0x0:  # 0011000
                    return 0x37
                # 0011xxx
                return 0x36 - temp
            if temp == 0x2:  # 0010
                temp = self.read_rev(3)
                if temp == 0x7:  # 0010111
                    return 0x38
                if temp == 0x6:  # 0010110
                    return 0x3d
                if temp == 0x5:  # 0010101
                    return 0x42
                if temp == 0x4:  # 0010100
                    return 0x46
                if temp == 0x3:  # 0010011
                    return 0x4d
                if temp == 0x2:  # 0010010
                    return 0x50
                if temp == 0x1:  # 0010001
                    return 0x55
                # 0010000
                return 0x6b
            if temp == 0x1:  # 0001
                temp = self.read_rev(3)
                if temp == 0x7:  # 0001111
                    return 0x77
                if temp == 0x6:  # 0001110
                    if self.read(1):  # 00011101
                        return 0x09
                    # 00011100
                    return 0x22
                if temp == 0x5:  # 0001101
                    if self.read(1):  # 00011011
                        return 0x27
                    # 00011010
                    return 0x2a
                if temp == 0x4:  # 0001100
                    if self.read(1):  # 00011001
                        return 0x2f
                    # 00011000
                    return 0x36
                if temp == 0x3:  # 0001011x
                    return 0x3a - self.read(1)
                if temp == 0x2:  # 0001010x
                    return 0x48 - self.read(1)
                if temp == 0x1:  # 0001001
                    if self.read(1):  # 00010011
                        return 0x57
                    # 00010010
                    return 0x5b
                if self.read(1):  # 00010001
                    return 0x5f
                # 00010000
                return 0x76
            # 0000
            temp = self.read_rev(3)
            if temp == 0x7:  # 0000111x
                return 0x79 - self.read(1)
            if temp == 0x6:  # 0000110
                temp = self.read_rev(2)
                if temp == 0x3:  # 000011011
                    return 0x2b
                if temp == 0x2:  # 000011010
                    return 0x3e
                if temp == 0x1:  # 000011001
                    return 0x4b
                # 000011000
                return 0x56
            if temp == 0x5:  # 0000101
                temp = self.read_rev(2)
                if temp == 0x3:  # 000010111
                    return 0x58
                if temp == 0x2:  # 000010110
                    return 0x59
                if temp == 0x1:  # 000010101
                    return 0x5d
                if self.read(1):  # 0000101001
                    return 0x21
                # 0000101000
                return 0x24
            if temp == 0x4:  # 0000100
                if self.read(1):  # 00001001
                    temp = self.read_rev(2)
                    if temp == 0x3:  # 0000100111
                        return 0x26
                    if temp == 0x2:  # 0000100110
                        return 0x71
                    if temp == 0x1:  # 0000100101
                        return 0x7a
                    if self.read(1):  # 00001001001
                        return 0x00
                    # 00001001000
                    return 0x3c
                temp = self.read_rev(3)
                if temp == 0x7:  # 00001000111
                    return 0x3f
                if temp == 0x6:  # 00001000110
                    return 0x4a
                if temp == 0x5:  # 00001000101
                    return 0x51
                if temp == 0x4:  # 00001000100
                    return 0x5a
                if temp == 0x3:  # 00001000011
                    return 0x5c
                if temp == 0x2:  # 00001000010
                    return 0x6a
                if temp == 0x1:  # 00001000001
                    return 0x7b
                # 00001000000
                return 0x7c
            if temp == 0x3:  # 0000011xxxxx
                temp = self.read_rev(5)
                if temp >= 0x18:
                    return 0x20 - temp
                if temp >= 0x16:
                    return 0x22 - temp
                if temp >= 0x0a:
                    return 0x23 - temp
                if temp >= 0x05:
                    return 0x24 - temp
                if temp == 0x04:
                    return 0x23
                if temp == 0x03:
                    return 0x25
                if temp == 0x02:
                    return 0x3b
                if temp == 0x01:
                    return 0x40
                return 0x5e
            if temp == 0x2:  # 0000010xxxxx
                temp = self.read_rev(5)
                if temp == 0x1f:
                    return 0x60
                if temp >= 0x1c:
                    return 0x9b - temp
                return 0xcb - temp
            if temp == 0x1:  # 0000001xxxxx
                temp = self.read_rev(5)
                if temp >= 0x0c:
                    return 0xeb - temp
                if temp == 0x0b:
                    return 0xe1
                if temp == 0x0a:
                    return 0xe5
                if temp == 0x09:
                    return 0xe9
                if temp == 0x08:
                    return 0xee
                if temp >= 0x05:
                    return 0xf9 - temp
                if temp == 0x04:
                    if self.read(1):
                        return 0x1a
                    return 0x80
                if temp == 0x03:
                    return 0x82 - self.read(1)
                if temp == 0x02:
                    return 0x84 - self.read(1)
                if temp == 0x01:
                    return 0x86 - self.read(1)
                return 0x88 - self.read(1)
            # 0000000xxxxxx
            temp = self.read_rev(6)
            if temp >= 0x19:
                return 0xc8 - temp
            if temp == 0x18:
                return 0xe0
            if temp >= 0x15:
                return 0xf9 - temp
            if temp >= 0x12:
                return 0xfa - temp
            if temp >= 0x0e:
                return 0xfb - temp
            if temp >= 0x0b:
                return 0xfc - temp
            return 0xff - temp
        return self.read(8)

    def decode_copy_length(self):
        """Return the length of the next copy token (2..519).

        The largest value, 519, is used by get_next_token() as the end of
        stream marker.
        """
        temp = self.read_rev(2)
        if temp == 0x1:  # 01
            if self.read(1):  # 011
                return 5
            # 010
            if self.read(1):  # 0101
                return 6
            # 0100
            return 7
        if temp == 0x2:  # 10
            if self.read(1):  # 101
                return 2
            # 100
            return 4
        if temp == 0x3:  # 11
            return 3
        # 00
        if self.read(1):  # 001
            if self.read(1):  # 0011
                return 8
            # 0010
            if self.read(1):  # 00101
                return 9
            # 00100x
            return 10 + self.read(1)
        # 000
        if self.read(1):  # 0001
            if self.read(1):  # 00011xx
                return 12 + self.read(2)
            # 00010xxx
            return 16 + self.read(3)
        # 0000
        temp = self.read_rev(2)
        if temp == 0x3:  # 000011xxxx
            return 24 + self.read(4)
        if temp == 0x2:  # 000010xxxxx
            return 40 + self.read(5)
        if temp == 0x1:  # 000001xxxxxx
            return 72 + self.read(6)
        # 000000
        if self.read(1):  # 0000001xxxxxxx
            return 136 + self.read(7)
        # 0000000xxxxxxxx
        return 264 + self.read(8)

    def calc_offset(self, high, low):
        """Combine the decoded *high* part with *low* plain bits read next."""
        return (high << low) | self.read(low)

    def decode_copy_offset(self, low):
        """Return the offset of the next copy token.

        low -- number of plain low-order bits that follow the Huffman coded
               high part.  Offset 0 refers to the most recently written
               byte (see decode()).
        """
        temp = self.read(2)  # not reversed: bit 0 is the first bit read
        if temp == 0x3:  # 11
            return self.calc_offset(0x00, low)
        if temp == 0x1:  # 10
            if self.read(1):  # 101
                if self.read(1):  # 1011
                    return self.calc_offset(0x01, low)
                # 1010
                return self.calc_offset(0x02, low)
            # 100xx
            return self.calc_offset(0x06 - self.read_rev(2), low)
        if temp == 0x2:  # 01
            temp = self.read_rev(4)
            if temp:  # 01xxxx
                return self.calc_offset(0x16 - temp, low)
            # 010000x
            return self.calc_offset(0x17 - self.read(1), low)
        # 00
        if self.read(1):  # 001xxxx
            return self.calc_offset(0x27 - self.read_rev(4), low)
        # 000
        if self.read(1):  # 0001xxx
            return self.calc_offset(0x2f - self.read_rev(3), low)
        # 0000xxxx
        return self.calc_offset(0x3f - self.read_rev(4), low)

    def get_next_token(self):
        """Return the next token as ``(type, literal, length, offset)``.

        type is 0 for a literal, 1 for a copy and -1 at the end of the
        stream; fields that do not apply to the token type are 0.
        """
        if self.read(1) == 0:
            return (0, self.decode_literal(), 0, 0)
        length = self.decode_copy_length()
        if length == 519:  # end of stream
            return (-1, 0, 0, 0)
        # Copies of length 2 always carry just two plain offset bits.
        low = 2 if length == 2 else self.dict_bytes
        return (1, 0, length, self.decode_copy_offset(low))

    def decode(self):
        """Decode up to the end-of-stream marker.

        Returns an ``array.array('B')`` holding the decompressed bytes.
        StopIteration propagates if the input ends before the marker.
        """
        output = array.array('B')
        dictionary = self.dictionary
        size = self.dict_size
        while True:
            tktype, literal, length, offset = self.get_next_token()
            if tktype < 0:
                break
            if tktype == 0:
                output.append(literal)
                dictionary[self.current_key] = literal
                self.current_key = (self.current_key + 1) % size
            else:
                src = (self.current_key - 1 - offset) % size
                dst = self.current_key
                # Byte-wise copy: when source and destination overlap, the
                # bytes written by this very token are read again, which
                # repeats the pattern as the format intends.
                for _ in range(length):
                    value = dictionary[src]
                    output.append(value)
                    dictionary[dst] = value
                    src = (src + 1) % size
                    dst = (dst + 1) % size
                self.current_key = dst
        return output


def _native_str(raw):
    """Return byte string *raw* as the interpreter's native ``str`` type."""
    if isinstance(raw, str):  # Python 2: bytes already is str
        return raw
    return raw.decode("latin-1")


def _ensure_dir(path):
    """Create directory *path* (including parents) unless it exists."""
    if not os.path.exists(path):
        os.makedirs(path)


def _expand_path(path):
    """Expand ``~`` and environment variables in a command line path."""
    return os.path.expandvars(os.path.expanduser(path))


def unpack_file(f, out, length):
    """Decompress one archive member from *f* into the file named *out*.

    f      -- archive file object positioned at the member's data.
    out    -- path of the file to create.
    length -- expected uncompressed size in bytes; compressed chunks are
              decoded until at least this many bytes have been produced.

    Each chunk is preceded by 36 bytes that are skipped unread.  A progress
    percentage is printed after every chunk.
    """
    with open(out, "wb") as g:
        lenout = 0
        while lenout < length:
            f.seek(36, 1)
            stream = PKStream(f).decode()
            lenout += len(stream)
            sys.stdout.write("%s %%\x0d\n" % (lenout * 100 // length))
            stream.tofile(g)


def unpack_header(f, at, names, offset):
    """Read the 25 byte file record at *at* and extract the file it names.

    f      -- open archive file object.
    at     -- position of the record, relative to *offset*.
    names  -- list of output directories, indexed by the record's
              directory number.
    offset -- position of the archive inside the file.
    """
    f.seek(offset + at)
    # Explicit little-endian: identical to the native-order reads this code
    # used to do on x86 -- NOTE(review): assumes the archive format is
    # little-endian; confirm against a format description.
    length, = struct.unpack('<I', f.read(4))
    to = names[struct.unpack('<I', f.read(4))[0]]
    name = _native_str(f.read(13).strip(b'\x00'))
    start, = struct.unpack('<I', f.read(4))
    out = os.path.join(to, name)
    f.seek(offset + start)
    print("\"%s\" (length: %s bytes)" % (out, length))
    unpack_file(f, out, length)
    print("File \"%s\" unpacked\n" % out)


def unpack_archive(archive, directory, offset=0):
    """Extract every file of *archive* below *directory*.

    archive   -- path of the archive file.
    directory -- target directory; created when missing.
    offset    -- position of the archive inside the file (default 0).

    The archive starts with two 32 bit values: the start of the file table
    (25 bytes per file) and its end, where the table of 60 byte directory
    names begins.
    """
    with open(archive, "rb") as f:
        f.seek(offset)
        start, end = struct.unpack('<II', f.read(8))
        nfiles = (end - start) // 25
        _ensure_dir(directory)
        # The number of directory names is not stored; derive it from the
        # largest directory index used by any file record.
        ndirs = 0
        f.seek(offset + start + 4)
        for _ in range(nfiles):
            ndirs = max(ndirs, struct.unpack('<I', f.read(4))[0])
            f.seek(21, 1)
        names = []
        f.seek(offset + end)
        for _ in range(ndirs + 1):
            name = _native_str(f.read(60).strip(b'\x00'))
            name = name.replace("\\", os.path.sep)
            if name != ".":
                to = os.path.join(directory, name)
            else:
                to = directory
            _ensure_dir(to)
            names.append(to)
        print("Found %s files in archive %s at offset %s.\n"
              % (nfiles, archive, offset))
        for i in range(nfiles):
            sys.stdout.write("Extracting file %s of %s, " % (i + 1, nfiles))
            unpack_header(f, start + i * 25, names, offset)


if __name__ == "__main__":
    if len(sys.argv) == 3:
        unpack_archive(_expand_path(sys.argv[1]), _expand_path(sys.argv[2]))
    elif len(sys.argv) == 4:
        unpack_archive(_expand_path(sys.argv[1]), _expand_path(sys.argv[2]),
                       offset=int(sys.argv[3]))
    else:
        print("usage: python2 unpk.py <archive> <target directory> [offset]")