zip.go

package archiver

import (
	"bytes"
	"compress/flate"
	"fmt"
	"io"
	"io/ioutil"
	"log"
	"os"
	"path"
	"path/filepath"
	"strings"

	"github.com/dsnet/compress/bzip2"
	"github.com/klauspost/compress/zip"
	"github.com/klauspost/compress/zstd"
	"github.com/ulikunitz/xz"
)

// ZipCompressionMethod Compression type
type ZipCompressionMethod uint16

// Compression methods.
// see https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT.
// Note LZMA: Disabled - because 7z isn't able to unpack ZIP+LZMA ZIP+LZMA2 archives made this way - and vice versa.
const (
	Store   ZipCompressionMethod = 0
	Deflate ZipCompressionMethod = 8
	BZIP2   ZipCompressionMethod = 12
	LZMA    ZipCompressionMethod = 14
	ZSTD    ZipCompressionMethod = 93
	XZ      ZipCompressionMethod = 95
)

// Zip provides facilities for operating ZIP archives.
// See https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT.
type Zip struct {
	// The compression level to use, as described
	// in the compress/flate package.
	CompressionLevel int

	// Whether to overwrite existing files; if false,
	// an error is returned if the file exists.
	OverwriteExisting bool

	// Whether to make all the directories necessary
	// to create a zip archive in the desired path.
	MkdirAll bool

	// If enabled, selective compression will only
	// compress files which are not already in a
	// compressed format; this is decided based
	// simply on file extension.
	SelectiveCompression bool

	// A single top-level folder can be implicitly
	// created by the Archive or Unarchive methods
	// if the files to be added to the archive
	// or the files to be extracted from the archive
	// do not all have a common root. This roughly
	// mimics the behavior of archival tools integrated
	// into OS file browsers which create a subfolder
	// to avoid unexpectedly littering the destination
	// folder with potentially many files, causing a
	// problematic cleanup/organization situation.
	// This feature is available for both creation
	// and extraction of archives, but may be slightly
	// inefficient with lots and lots of files,
	// especially on extraction.
	ImplicitTopLevelFolder bool

	// Strip number of leading paths. This feature is available
	// only during unpacking of the entire archive.
	StripComponents int

	// If true, errors encountered during reading
	// or writing a single file will be logged and
	// the operation will continue on remaining files.
	ContinueOnError bool

	// Compression algorithm
	FileMethod ZipCompressionMethod
	zw         *zip.Writer
	zr         *zip.Reader
	ridx       int
	//decinitialized bool
}

// CheckExt ensures the file extension matches the format.
func (*Zip) CheckExt(filename string) error {
	if !strings.HasSuffix(filename, ".zip") {
		return fmt.Errorf("filename must have a .zip extension")
	}
	return nil
}

// Registering a global decompressor is not reentrant and may panic
func registerDecompressor(zr *zip.Reader) {
	// register zstd decompressor
	zr.RegisterDecompressor(uint16(ZSTD), func(r io.Reader) io.ReadCloser {
		zr, err := zstd.NewReader(r)
		if err != nil {
			return nil
		}
		return zr.IOReadCloser()
	})
	zr.RegisterDecompressor(uint16(BZIP2), func(r io.Reader) io.ReadCloser {
		bz2r, err := bzip2.NewReader(r, nil)
		if err != nil {
			return nil
		}
		return bz2r
	})
	zr.RegisterDecompressor(uint16(XZ), func(r io.Reader) io.ReadCloser {
		xr, err := xz.NewReader(r)
		if err != nil {
			return nil
		}
		return ioutil.NopCloser(xr)
	})
}

// CheckPath ensures the file extension matches the format.
func (*Zip) CheckPath(to, filename string) error {
	to, _ = filepath.Abs(to) //explicit the destination folder to prevent that 'string.HasPrefix' check can be 'bypassed' when no destination folder is supplied in input
	dest := filepath.Join(to, filename)
	//prevent path traversal attacks
	if !strings.HasPrefix(dest, to) {
		return &IllegalPathError{AbsolutePath: dest, Filename: filename}
	}
	return nil
}

// Archive creates a .zip file at destination containing
// the files listed in sources. The destination must end
// with ".zip". File paths can be those of regular files
// or directories. Regular files are stored at the 'root'
// of the archive, and directories are recursively added.
func (z *Zip) Archive(sources []string, destination string) error {
	err := z.CheckExt(destination)
	if err != nil {
		return fmt.Errorf("checking extension: %v", err)
	}
	if !z.OverwriteExisting && fileExists(destination) {
		return fmt.Errorf("file already exists: %s", destination)
	}

	// make the folder to contain the resulting archive
	// if it does not already exist
	destDir := filepath.Dir(destination)
	if z.MkdirAll && !fileExists(destDir) {
		err := mkdir(destDir, 0755)
		if err != nil {
			return fmt.Errorf("making folder for destination: %v", err)
		}
	}

	out, err := os.Create(destination)
	if err != nil {
		return fmt.Errorf("creating %s: %v", destination, err)
	}
	defer out.Close()

	err = z.Create(out)
	if err != nil {
		return fmt.Errorf("creating zip: %v", err)
	}
	defer z.Close()

	var topLevelFolder string
	if z.ImplicitTopLevelFolder && multipleTopLevels(sources) {
		topLevelFolder = folderNameFromFileName(destination)
	}

	for _, source := range sources {
		err := z.writeWalk(source, topLevelFolder, destination)
		if err != nil {
			return fmt.Errorf("walking %s: %v", source, err)
		}
	}

	return nil
}

// Unarchive unpacks the .zip file at source to destination.
// Destination will be treated as a folder name.
func (z *Zip) Unarchive(source, destination string) error {
	if !fileExists(destination) && z.MkdirAll {
		err := mkdir(destination, 0755)
		if err != nil {
			return fmt.Errorf("preparing destination: %v", err)
		}
	}

	file, err := os.Open(source)
	if err != nil {
		return fmt.Errorf("opening source file: %v", err)
	}
	defer file.Close()

	fileInfo, err := file.Stat()
	if err != nil {
		return fmt.Errorf("statting source file: %v", err)
	}

	err = z.Open(file, fileInfo.Size())
	if err != nil {
		return fmt.Errorf("opening zip archive for reading: %v", err)
	}
	defer z.Close()

	// if the files in the archive do not all share a common
	// root, then make sure we extract to a single subfolder
	// rather than potentially littering the destination...
	if z.ImplicitTopLevelFolder {
		files := make([]string, len(z.zr.File))
		for i := range z.zr.File {
			files[i] = z.zr.File[i].Name
		}
		if multipleTopLevels(files) {
			destination = filepath.Join(destination, folderNameFromFileName(source))
		}
	}

	for {
		err := z.extractNext(destination)
		if err == io.EOF {
			break
		}
		if err != nil {
			if z.ContinueOnError || IsIllegalPathError(err) {
				log.Printf("[ERROR] Reading file in zip archive: %v", err)
				continue
			}
			return fmt.Errorf("reading file in zip archive: %v", err)
		}
	}

	return nil
}

func (z *Zip) extractNext(to string) error {
	f, err := z.Read()
	if err != nil {
		return err // don't wrap error; calling loop must break on io.EOF
	}
	defer f.Close()

	header, ok := f.Header.(zip.FileHeader)
	if !ok {
		return fmt.Errorf("expected header to be zip.FileHeader but was %T", f.Header)
	}

	errPath := z.CheckPath(to, header.Name)
	if errPath != nil {
		return fmt.Errorf("checking path traversal attempt: %v", errPath)
	}

	if z.StripComponents > 0 {
		if strings.Count(header.Name, "/") < z.StripComponents {
			return nil // skip path with fewer components
		}

		for i := 0; i < z.StripComponents; i++ {
			slash := strings.Index(header.Name, "/")
			header.Name = header.Name[slash+1:]
		}
	}
	return z.extractFile(f, to, &header)
}

func (z *Zip) extractFile(f File, to string, header *zip.FileHeader) error {
	to = filepath.Join(to, header.Name)

	// if a directory, no content; simply make the directory and return
	if f.IsDir() {
		return mkdir(to, f.Mode())
	}

	// do not overwrite existing files, if configured
	if !z.OverwriteExisting && fileExists(to) {
		return fmt.Errorf("file already exists: %s", to)
	}

	// extract symbolic links as symbolic links
	if isSymlink(header.FileInfo()) {
		// symlink target is the contents of the file
		buf := new(bytes.Buffer)
		_, err := io.Copy(buf, f)
		if err != nil {
			return fmt.Errorf("%s: reading symlink target: %v", header.Name, err)
		}
		return writeNewSymbolicLink(to, strings.TrimSpace(buf.String()))
	}

	return writeNewFile(to, f, f.Mode())
}

func (z *Zip) writeWalk(source, topLevelFolder, destination string) error {
	sourceInfo, err := os.Stat(source)
	if err != nil {
		return fmt.Errorf("%s: stat: %v", source, err)
	}
	destAbs, err := filepath.Abs(destination)
	if err != nil {
		return fmt.Errorf("%s: getting absolute path of destination %s: %v", source, destination, err)
	}

	return filepath.Walk(source, func(fpath string, info os.FileInfo, err error) error {
		handleErr := func(err error) error {
			if z.ContinueOnError {
				log.Printf("[ERROR] Walking %s: %v", fpath, err)
				return nil
			}
			return err
		}
		if err != nil {
			return handleErr(fmt.Errorf("traversing %s: %v", fpath, err))
		}
		if info == nil {
			return handleErr(fmt.Errorf("%s: no file info", fpath))
		}

		// make sure we do not copy the output file into the output
		// file; that results in an infinite loop and disk exhaustion!
		fpathAbs, err := filepath.Abs(fpath)
		if err != nil {
			return handleErr(fmt.Errorf("%s: getting absolute path: %v", fpath, err))
		}
		if within(fpathAbs, destAbs) {
			return nil
		}

		// build the name to be used within the archive
		nameInArchive, err := makeNameInArchive(sourceInfo, source, topLevelFolder, fpath)
		if err != nil {
			return handleErr(err)
		}

		var file io.ReadCloser
		if info.Mode().IsRegular() {
			file, err = os.Open(fpath)
			if err != nil {
				return handleErr(fmt.Errorf("%s: opening: %v", fpath, err))
			}
			defer file.Close()
		}
		err = z.Write(File{
			FileInfo: FileInfo{
				FileInfo:   info,
				CustomName: nameInArchive,
			},
			ReadCloser: file,
		})
		if err != nil {
			return handleErr(fmt.Errorf("%s: writing: %s", fpath, err))
		}

		return nil
	})
}

// Create opens z for writing a ZIP archive to out.
func (z *Zip) Create(out io.Writer) error {
	if z.zw != nil {
		return fmt.Errorf("zip archive is already created for writing")
	}
	z.zw = zip.NewWriter(out)
	if z.CompressionLevel != flate.DefaultCompression {
		z.zw.RegisterCompressor(zip.Deflate, func(out io.Writer) (io.WriteCloser, error) {
			return flate.NewWriter(out, z.CompressionLevel)
		})
	}
	switch z.FileMethod {
	case BZIP2:
		z.zw.RegisterCompressor(uint16(BZIP2), func(out io.Writer) (io.WriteCloser, error) {
			return bzip2.NewWriter(out, &bzip2.WriterConfig{Level: z.CompressionLevel})
		})
	case ZSTD:
		z.zw.RegisterCompressor(uint16(ZSTD), func(out io.Writer) (io.WriteCloser, error) {
			return zstd.NewWriter(out)
		})
	case XZ:
		z.zw.RegisterCompressor(uint16(XZ), func(out io.Writer) (io.WriteCloser, error) {
			return xz.NewWriter(out)
		})
	}
	return nil
}

// Write writes f to z, which must have been opened for writing first.
func (z *Zip) Write(f File) error {
	if z.zw == nil {
		return fmt.Errorf("zip archive was not created for writing first")
	}
	if f.FileInfo == nil {
		return fmt.Errorf("no file info")
	}
	if f.FileInfo.Name() == "" {
		return fmt.Errorf("missing file name")
	}

	header, err := zip.FileInfoHeader(f)
	if err != nil {
		return fmt.Errorf("%s: getting header: %v", f.Name(), err)
	}

	if f.IsDir() {
		header.Name += "/" // required - strangely no mention of this in zip spec? but is in godoc...
		header.Method = zip.Store
	} else {
		ext := strings.ToLower(path.Ext(header.Name))
		if _, ok := compressedFormats[ext]; ok && z.SelectiveCompression {
			header.Method = zip.Store
		} else {
			header.Method = uint16(z.FileMethod)
		}
	}

	writer, err := z.zw.CreateHeader(header)
	if err != nil {
		return fmt.Errorf("%s: making header: %w", f.Name(), err)
	}

	return z.writeFile(f, writer)
}

func (z *Zip) writeFile(f File, writer io.Writer) error {
	if f.IsDir() {
		return nil // directories have no contents
	}
	if isSymlink(f) {
		// file body for symlinks is the symlink target
		linkTarget, err := os.Readlink(f.Name())
		if err != nil {
			return fmt.Errorf("%s: readlink: %v", f.Name(), err)
		}
		_, err = writer.Write([]byte(filepath.ToSlash(linkTarget)))
		if err != nil {
			return fmt.Errorf("%s: writing symlink target: %v", f.Name(), err)
		}
		return nil
	}

	if f.ReadCloser == nil {
		return fmt.Errorf("%s: no way to read file contents", f.Name())
	}
	_, err := io.Copy(writer, f)
	if err != nil {
		return fmt.Errorf("%s: copying contents: %w", f.Name(), err)
	}

	return nil
}

// Open opens z for reading an archive from in,
// which is expected to have the given size and
// which must be an io.ReaderAt.
func (z *Zip) Open(in io.Reader, size int64) error {
	inRdrAt, ok := in.(io.ReaderAt)
	if !ok {
		return fmt.Errorf("reader must be io.ReaderAt")
	}
	if z.zr != nil {
		return fmt.Errorf("zip archive is already open for reading")
	}
	var err error
	z.zr, err = zip.NewReader(inRdrAt, size)
	if err != nil {
		return fmt.Errorf("creating reader: %v", err)
	}
	registerDecompressor(z.zr)
	z.ridx = 0
	return nil
}

// Read reads the next file from z, which must have
// already been opened for reading. If there are no
// more files, the error is io.EOF. The File must
// be closed when finished reading from it.
func (z *Zip) Read() (File, error) {
	if z.zr == nil {
		return File{}, fmt.Errorf("zip archive is not open")
	}
	if z.ridx >= len(z.zr.File) {
		return File{}, io.EOF
	}

	// access the file and increment counter so that
	// if there is an error processing this file, the
	// caller can still iterate to the next file
	zf := z.zr.File[z.ridx]
	z.ridx++

	file := File{
		FileInfo: zf.FileInfo(),
		Header:   zf.FileHeader,
	}

	rc, err := zf.Open()
	if err != nil {
		return file, fmt.Errorf("%s: open compressed file: %v", zf.Name, err)
	}
	file.ReadCloser = rc

	return file, nil
}

// Close closes the zip archive(s) opened by Create and Open.
func (z *Zip) Close() error {
	if z.zr != nil {
		z.zr = nil
	}
	if z.zw != nil {
		zw := z.zw
		z.zw = nil
		return zw.Close()
	}
	return nil
}

// Walk calls walkFn for each visited item in archive.
func (z *Zip) Walk(archive string, walkFn WalkFunc) error {
	zr, err := zip.OpenReader(archive)
	if err != nil {
		return fmt.Errorf("opening zip reader: %v", err)
	}
	defer zr.Close()
	registerDecompressor(&zr.Reader)
	for _, zf := range zr.File {
		zfrc, err := zf.Open()
		if err != nil {
			if zfrc != nil {
				zfrc.Close()
			}
			if z.ContinueOnError {
				log.Printf("[ERROR] Opening %s: %v", zf.Name, err)
				continue
			}
			return fmt.Errorf("opening %s: %v", zf.Name, err)
		}

		err = walkFn(File{
			FileInfo:   zf.FileInfo(),
			Header:     zf.FileHeader,
			ReadCloser: zfrc,
		})
		zfrc.Close()
		if err != nil {
			if err == ErrStopWalk {
				break
			}
			if z.ContinueOnError {
				log.Printf("[ERROR] Walking %s: %v", zf.Name, err)
				continue
			}
			return fmt.Errorf("walking %s: %v", zf.Name, err)
		}
	}

	return nil
}

// Extract extracts a single file from the zip archive.
// If the target is a directory, the entire folder will
// be extracted into destination.
func (z *Zip) Extract(source, target, destination string) error {
	// target refers to a path inside the archive, which should be clean also
	target = path.Clean(target)

	// if the target ends up being a directory, then
	// we will continue walking and extracting files
	// until we are no longer within that directory
	var targetDirPath string

	return z.Walk(source, func(f File) error {
		zfh, ok := f.Header.(zip.FileHeader)
		if !ok {
			return fmt.Errorf("expected header to be zip.FileHeader but was %T", f.Header)
		}

		// importantly, cleaning the path strips tailing slash,
		// which must be appended to folders within the archive
		name := path.Clean(zfh.Name)
		if f.IsDir() && target == name {
			targetDirPath = path.Dir(name)
		}

		if within(target, zfh.Name) {
			// either this is the exact file we want, or is
			// in the directory we want to extract

			// build the filename we will extract to
			end, err := filepath.Rel(targetDirPath, zfh.Name)
			if err != nil {
				return fmt.Errorf("relativizing paths: %v", err)
			}
			joined := filepath.Join(destination, end)

			err = z.extractFile(f, joined, &zfh)
			if err != nil {
				return fmt.Errorf("extracting file %s: %v", zfh.Name, err)
			}

			// if our target was not a directory, stop walk
			if targetDirPath == "" {
				return ErrStopWalk
			}
		} else if targetDirPath != "" {
			// finished walking the entire directory
			return ErrStopWalk
		}

		return nil
	})
}

// Match returns true if the format of file matches this
// type's format. It should not affect reader position.
func (*Zip) Match(file io.ReadSeeker) (bool, error) {
	currentPos, err := file.Seek(0, io.SeekCurrent)
	if err != nil {
		return false, err
	}
	_, err = file.Seek(0, 0)
	if err != nil {
		return false, err
	}
	defer func() {
		_, _ = file.Seek(currentPos, io.SeekStart)
	}()

	buf := make([]byte, 4)
	if n, err := file.Read(buf); err != nil || n < 4 {
		return false, nil
	}
	return bytes.Equal(buf, []byte("PK\x03\x04")), nil
}

func (z *Zip) String() string { return "zip" }

// NewZip returns a new, default instance ready to be customized and used.
func NewZip() *Zip {
	return &Zip{
		CompressionLevel:     flate.DefaultCompression,
		MkdirAll:             true,
		SelectiveCompression: true,
		FileMethod:           Deflate,
	}
}

// Compile-time checks to ensure type implements desired interfaces.
var (
	_ = Reader(new(Zip))
	_ = Writer(new(Zip))
	_ = Archiver(new(Zip))
	_ = Unarchiver(new(Zip))
	_ = Walker(new(Zip))
	_ = Extractor(new(Zip))
	_ = Matcher(new(Zip))
	_ = ExtensionChecker(new(Zip))
	_ = FilenameChecker(new(Zip))
)

// compressedFormats is a (non-exhaustive) set of lowercased
// file extensions for formats that are typically already
// compressed. Compressing files that are already compressed
// is inefficient, so use this set of extension to avoid that.
var compressedFormats = map[string]struct{}{
	".7z":   {},
	".avi":  {},
	".br":   {},
	".bz2":  {},
	".cab":  {},
	".docx": {},
	".gif":  {},
	".gz":   {},
	".jar":  {},
	".jpeg": {},
	".jpg":  {},
	".lz":   {},
	".lz4":  {},
	".lzma": {},
	".m4v":  {},
	".mov":  {},
	".mp3":  {},
	".mp4":  {},
	".mpeg": {},
	".mpg":  {},
	".png":  {},
	".pptx": {},
	".rar":  {},
	".sz":   {},
	".tbz2": {},
	".tgz":  {},
	".tsz":  {},
	".txz":  {},
	".xlsx": {},
	".xz":   {},
	".zip":  {},
	".zipx": {},
}

// DefaultZip is a default instance that is conveniently ready to use.
var DefaultZip = NewZip()