14

The goal is just to fetch a specific file without downloading the whole archive, using the HTTP range method described at http://www.codeproject.com/KB/cs/remotezip.aspx. Is there a library for retrieving a file from a remote zip?
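
For reference, fetching just a byte range over HTTP means sending a Range header; a minimal urllib2 sketch (the URL is a placeholder):

import urllib2

req = urllib2.Request("http://example.com/archive.zip")   # placeholder URL
req.add_header("Range", "bytes=0-99")                      # request only the first 100 bytes
resp = urllib2.urlopen(req)
print resp.getcode()        # 206 (Partial Content) when the server honours the range
print len(resp.read())      # 100 bytes if the range was honoured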

+1

That method clearly doesn't take subdirectories into account. Good question, by the way. – Blender

+0

Which protocol do you want to use? ssh (scp), ftp, sftp, http, ...? – chown

+0

HTTP, as described in the question; it should use HTTP range requests. –

Answers

16

You can tackle this a bit more generally with less code. Basically, create a file-like object that is good enough for ZipFile to use. You end up with z = ZipFile(HttpFile(url)), and it transparently downloads only the parts it needs. The advantage is that you write less code, and it applies to more than just zip files. (In fact, I wonder whether something like this already exists... I couldn't find it, though.)

Using the same idea, you could also build a cache into HttpFile to avoid downloading the same data repeatedly (see the sketch after the code below).

And here is the code (note the lack of error handling):

#!/usr/bin/python
import urllib2

class HttpFile(object):
    def __init__(self, url):
        self.url = url
        self.offset = 0
        self._size = -1

    def size(self):
        if self._size < 0:
            f = urllib2.urlopen(self.url)
            self._size = int(f.headers["Content-length"])
        return self._size

    def read(self, count=-1):
        req = urllib2.Request(self.url)
        if count < 0:
            end = self.size() - 1
        else:
            end = self.offset + count - 1
        req.headers['Range'] = "bytes=%s-%s" % (self.offset, end)
        f = urllib2.urlopen(req)
        data = f.read()
        # FIXME: should check that we got the range expected, etc.
        chunk = len(data)
        if count >= 0:
            assert chunk == count
        self.offset += chunk
        return data

    def seek(self, offset, whence=0):
        if whence == 0:
            self.offset = offset
        elif whence == 1:
            self.offset += offset
        elif whence == 2:
            self.offset = self.size() + offset
        else:
            raise Exception("Invalid whence")

    def tell(self):
        return self.offset
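
A ZipFile can then read straight from the remote archive through this object. A minimal usage sketch, plus one way the caching idea above could look (the URL is a placeholder and CachedHttpFile is a hypothetical helper, not part of the answer):

from zipfile import ZipFile

url = "http://example.com/archive.zip"      # placeholder URL
z = ZipFile(HttpFile(url))                  # downloads only the end record and central directory
print z.namelist()
data = z.read(z.namelist()[0])              # downloads only that member's bytes

class CachedHttpFile(HttpFile):
    """Hypothetical subclass that remembers previously fetched ranges."""
    def __init__(self, url):
        HttpFile.__init__(self, url)
        self._cache = {}                    # (offset, count) -> data

    def read(self, count=-1):
        key = (self.offset, count)
        if key in self._cache:
            data = self._cache[key]
            self.offset += len(data)        # advance the position as a real read would
            return data
        data = HttpFile.read(self, count)   # updates self.offset itself
        self._cache[key] = data
        return data
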
+0

Great implementation. Thanks –

6

Since no such library existed, I wrote a small module myself; most of the code and logic is taken from zipfile, with seek/read translated into HTTP range requests.

Feel free to review it and suggest improvements.

Code:

""" 
Read remote ZIP files using HTTP range requests 
""" 
import struct 
import urllib2 
import zlib 
import cStringIO 
from zipfile import ZipInfo, ZipExtFile, BadZipfile 
from os.path import join, basename 

# The code is mostly adapted from the zipfile module 
# NOTE: ZIP64 is not supported 

# The "end of central directory" structure, magic number, size, and indices 
# (section V.I in the format document) 
structEndArchive = "<4s4H2LH" 
stringEndArchive = "PK\005\006" 
sizeEndCentDir = struct.calcsize(structEndArchive) 

_ECD_SIGNATURE = 0 
_ECD_DISK_NUMBER = 1 
_ECD_DISK_START = 2 
_ECD_ENTRIES_THIS_DISK = 3 
_ECD_ENTRIES_TOTAL = 4 
_ECD_SIZE = 5 
_ECD_OFFSET = 6 
_ECD_COMMENT_SIZE = 7 
# These last two indices are not part of the structure as defined in the 
# spec, but they are used internally by this module as a convenience 
_ECD_COMMENT = 8 
_ECD_LOCATION = 9 

# The "central directory" structure, magic number, size, and indices 
# of entries in the structure (section V.F in the format document) 
structCentralDir = "<4s4B4HL2L5H2L" 
stringCentralDir = "PK\001\002" 
sizeCentralDir = struct.calcsize(structCentralDir) 

# indexes of entries in the central directory structure 
_CD_SIGNATURE = 0 
_CD_CREATE_VERSION = 1 
_CD_CREATE_SYSTEM = 2 
_CD_EXTRACT_VERSION = 3 
_CD_EXTRACT_SYSTEM = 4 
_CD_FLAG_BITS = 5 
_CD_COMPRESS_TYPE = 6 
_CD_TIME = 7 
_CD_DATE = 8 
_CD_CRC = 9 
_CD_COMPRESSED_SIZE = 10 
_CD_UNCOMPRESSED_SIZE = 11 
_CD_FILENAME_LENGTH = 12 
_CD_EXTRA_FIELD_LENGTH = 13 
_CD_COMMENT_LENGTH = 14 
_CD_DISK_NUMBER_START = 15 
_CD_INTERNAL_FILE_ATTRIBUTES = 16 
_CD_EXTERNAL_FILE_ATTRIBUTES = 17 
_CD_LOCAL_HEADER_OFFSET = 18 

# The "local file header" structure, magic number, size, and indices 
# (section V.A in the format document) 
structFileHeader = "<4s2B4HL2L2H" 
stringFileHeader = "PK\003\004" 
sizeFileHeader = struct.calcsize(structFileHeader) 

_FH_SIGNATURE = 0 
_FH_EXTRACT_VERSION = 1 
_FH_EXTRACT_SYSTEM = 2 
_FH_GENERAL_PURPOSE_FLAG_BITS = 3 
_FH_COMPRESSION_METHOD = 4 
_FH_LAST_MOD_TIME = 5 
_FH_LAST_MOD_DATE = 6 
_FH_CRC = 7 
_FH_COMPRESSED_SIZE = 8 
_FH_UNCOMPRESSED_SIZE = 9 
_FH_FILENAME_LENGTH = 10 
_FH_EXTRA_FIELD_LENGTH = 11 


def _http_get_partial_data(url, start_range, end_range=None): 
    req = urllib2.Request(url) 
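    # NOTE: a negative start_range yields a suffix range such as "bytes=-22",
    # i.e. the last N bytes of the resource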
    range_header = "bytes=%s" % start_range 
    if end_range is not None: 
     range_header += "-%s" % end_range 
    req.headers['Range'] = range_header 
    f = urllib2.urlopen(req)  
    return f 


def _EndRecData(url): 
    """Return data from the "End of Central Directory" record, or None. 

    The data is a list of the nine items in the ZIP "End of central dir" 
    record followed by a tenth item, the file seek offset of this record.""" 
    ECD = _http_get_partial_data(url, -sizeEndCentDir) 
    content_range = ECD.headers.get('Content-Range') 
    filesize = int(content_range.split('/')[1]) if content_range and '/' in content_range else 0 
    data = ECD.read() 
    ECD.close() 
    if data[0:4] == stringEndArchive and data[-2:] == "\000\000": 
     # the signature is correct and there's no comment, unpack structure 
     endrec = struct.unpack(structEndArchive, data) 
     endrec = list(endrec) 

     # Append a blank comment and record start offset 
     endrec.append("") 
     endrec.append(filesize - sizeEndCentDir) 
     return endrec 
    # Either this is not a ZIP file, or it is a ZIP file with an archive 
    # comment. Search the end of the file for the "end of central directory" 
    # record signature. The comment is the last item in the ZIP file and may be 
    # up to 64K long. It is assumed that the "end of central directory" magic 
    # number does not appear in the comment. 

    # Search by retrieving chunks of 256, 1k and 64k 
    try_ranges = (1 << 8, 1 << 10, 1 << 16) 
    for check_range in try_ranges: 
     ECD = _http_get_partial_data(url, -(check_range + sizeEndCentDir))  
     data = ECD.read()  
     content_range = ECD.headers.get('Content-Range')  
     ECD.close() 
     # Content-Range looks like "bytes 12345-67890/67891"; keep the numeric start offset
     download_start = int(content_range.split(' ')[1].split('-')[0]) 
     start = data.rfind(stringEndArchive)   
     if start >= 0:   
      # found the magic number; attempt to unpack and interpret 
      recData = data[start:start+sizeEndCentDir] 
      endrec = list(struct.unpack(structEndArchive, recData)) 
      commentSize = endrec[_ECD_COMMENT_SIZE] #as claimed by the zip file 
      comment = data[start+sizeEndCentDir:start+sizeEndCentDir+commentSize] 
      endrec.append(comment) 
      endrec.append(download_start + start)   
      return endrec 

    raise IOError 


class HTTPZipFile: 
    def __init__(self, url): 
     self.url = url 
     self.NameToInfo = {} # Find file info given name 
     self.filelist = []  # List of ZipInfo instances for archive 
     self.pwd = None 
     self.comment = '' 
     self.debug = 0 
     self._RealGetContents()  

    def _RealGetContents(self): 
     """Read in the table of contents for the ZIP file.""" 
     try: 
      endrec = _EndRecData(self.url) 
     except IOError: 
      raise BadZipfile("File is not a zip file") 
     if not endrec: 
      raise BadZipfile, "File is not a zip file" 
     if self.debug > 1: 
      print endrec 
     size_cd = endrec[_ECD_SIZE]    # bytes in central directory 
     offset_cd = endrec[_ECD_OFFSET]   # offset of central directory 
     self.comment = endrec[_ECD_COMMENT]  # archive comment 

     # "concat" is zero, unless zip was concatenated to another file 
     concat = endrec[_ECD_LOCATION] - size_cd - offset_cd 
     #if endrec[_ECD_SIGNATURE] == stringEndArchive64: 
     # # If Zip64 extension structures are present, account for them 
     # concat -= (sizeEndCentDir64 + sizeEndCentDir64Locator) 

     if self.debug > 2: 
      inferred = concat + offset_cd 
      print "given, inferred, offset", offset_cd, inferred, concat 
     # self.start_dir: Position of start of central directory 
     self.start_dir = offset_cd + concat 
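     # fetch the entire central directory with a single range request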
     ECD = _http_get_partial_data(self.url, self.start_dir, self.start_dir+size_cd-1) 
     data = ECD.read() 
     ECD.close() 
     fp = cStringIO.StringIO(data)    
     total = 0 
     while total < size_cd: 
      centdir = fp.read(sizeCentralDir) 
      if centdir[0:4] != stringCentralDir: 
       raise BadZipfile, "Bad magic number for central directory" 
      centdir = struct.unpack(structCentralDir, centdir) 
      if self.debug > 2: 
       print centdir 
      filename = fp.read(centdir[_CD_FILENAME_LENGTH]) 
      # Create ZipInfo instance to store file information 
      x = ZipInfo(filename) 
      x.extra = fp.read(centdir[_CD_EXTRA_FIELD_LENGTH]) 
      x.comment = fp.read(centdir[_CD_COMMENT_LENGTH]) 
      x.header_offset = centdir[_CD_LOCAL_HEADER_OFFSET] 
      (x.create_version, x.create_system, x.extract_version, x.reserved, 
       x.flag_bits, x.compress_type, t, d, 
       x.CRC, x.compress_size, x.file_size) = centdir[1:12] 
      x.volume, x.internal_attr, x.external_attr = centdir[15:18] 
      # Convert date/time code to (year, month, day, hour, min, sec) 
      x._raw_time = t 
      x.date_time = ((d>>9)+1980, (d>>5)&0xF, d&0x1F, 
            t>>11, (t>>5)&0x3F, (t&0x1F) * 2) 

      x._decodeExtra() 
      x.header_offset = x.header_offset + concat 
      x.filename = x._decodeFilename() 
      self.filelist.append(x) 
      self.NameToInfo[x.filename] = x 

      # update total bytes read from central directory 
      total = (total + sizeCentralDir + centdir[_CD_FILENAME_LENGTH] 
        + centdir[_CD_EXTRA_FIELD_LENGTH] 
        + centdir[_CD_COMMENT_LENGTH]) 

     if self.debug > 2: 
      print "total", total 

    def namelist(self): 
     """Return a list of file names in the archive.""" 
     l = [] 
     for data in self.filelist: 
      l.append(data.filename) 
     return l 

    def infolist(self): 
     """Return a list of class ZipInfo instances for files in the 
     archive.""" 
     return self.filelist 

    def printdir(self): 
     """Print a table of contents for the zip file.""" 
     print "%-46s %19s %12s" % ("File Name", "Modified ", "Size") 
     for zinfo in self.filelist: 
      date = "%d-%02d-%02d %02d:%02d:%02d" % zinfo.date_time[:6] 
      print "%-46s %s %12d" % (zinfo.filename, date, zinfo.file_size) 

    def getinfo(self, name): 
     """Return the instance of ZipInfo given 'name'.""" 
     info = self.NameToInfo.get(name) 
     if info is None: 
      raise KeyError(
       'There is no item named %r in the archive' % name) 

     return info   

    def open(self, name, pwd=None): 
     """Return file-like object for 'name'.""" 
     if not self.url: 
      raise RuntimeError, \ 
        "Attempt to read ZIP archive that was already closed" 
     zinfo = self.getinfo(name) 
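     # three range requests follow: the local file header, the stored file name, then the compressed data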
     offset = zinfo.header_offset 
     f = _http_get_partial_data(self.url, offset, offset+sizeFileHeader-1) 
     fheader = f.read() 
     f.close() 

     fheader = struct.unpack(structFileHeader, fheader) 
     offset += sizeFileHeader 
     f = _http_get_partial_data(self.url, offset, offset+fheader[_FH_FILENAME_LENGTH]-1) 
     fname = f.read() 
     f.close() 

     if fname != zinfo.orig_filename: 
      raise BadZipfile, \ 
         'File name in directory "%s" and header "%s" differ.' % (
          zinfo.orig_filename, fname) 

     is_encrypted = zinfo.flag_bits & 0x1 
     if is_encrypted: 
      raise RuntimeError, "File %s is encrypted, " \ 
        "not supported." % name 

     offset += fheader[_FH_FILENAME_LENGTH]+fheader[_FH_EXTRA_FIELD_LENGTH] 
     f = _http_get_partial_data(self.url, offset, offset+fheader[_FH_COMPRESSED_SIZE]-1) 
     data = f.read() 
     return ZipExtFile(cStringIO.StringIO(data), 'r', zinfo) 


if __name__ == "__main__": 
    # Some tests 
    link="http://dfn.dl.sourceforge.net/project/filezilla/FileZilla_Client/3.5.1/FileZilla_3.5.1_win32.zip" 
    hzfile = HTTPZipFile(link) 
    hzfile.printdir() 
    for fname in ('GPL.html', 'resources/blukis/48x48/filter.png', 'resources/finished.wav'): 
     source_name = join('FileZilla-3.5.1', fname) 
     dest_fname = join('/tmp', basename(fname)) 
     print "Extracing %s to %s" % (source_name, dest_fname) 
     with hzfile.open(source_name) as f: 
      data = f.read() 
      new_file = open(dest_fname, 'w') 
      new_file.write(data) 
      new_file.close() 