Compress indexes #43

Open · wants to merge 6 commits into base: master

Changes from 5 commits
103 changes: 103 additions & 0 deletions src/java/com/hadoop/compression/lzo/LzoBasicIndexSerde.java
@@ -0,0 +1,103 @@
/*
* This file is part of Hadoop-Gpl-Compression.
*
* Hadoop-Gpl-Compression is free software: you can redistribute it
* and/or modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* Hadoop-Gpl-Compression is distributed in the hope that it will be
* useful, but WITHOUT ANY WARRANTY; without even the implied warranty
* of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Hadoop-Gpl-Compression. If not, see
* <http://www.gnu.org/licenses/>.
*/

package com.hadoop.compression.lzo;

import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;

import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.IOUtils;

public class LzoBasicIndexSerde implements LzoIndexSerde {

private static final int BUFFER_CAPACITY = 16 * 1024 * 8; //size for a 4GB file (with 256KB lzo blocks)

private DataOutputStream os;
private DataInputStream is;
private ByteBuffer bytesIn;
private long firstLong;
private int numBlocks = 0;
private boolean processedFirstLong = false;

@Override
public boolean accepts(long firstLong) {
if (firstLong < 0) {
return false;
} else {
this.firstLong = firstLong;
return true;
}
}

@Override
public void prepareToWrite(DataOutputStream os) throws IOException {
this.os = os;
}

@Override
public void prepareToRead(DataInputStream is) throws IOException {
this.is = is;
bytesIn = fillBuffer();
numBlocks = bytesIn.remaining()/8 + 1; // plus one for the first long.
processedFirstLong = false;
}

@Override
public void writeOffset(long offset) throws IOException {
os.writeLong(offset);
}

@Override
public void finishWriting() throws IOException {
os.close();
}

@Override
public boolean hasNext() throws IOException {
return !processedFirstLong || (bytesIn != null && bytesIn.hasRemaining());
}

@Override
public long next() throws IOException {
if (!processedFirstLong) {
processedFirstLong = true;
return firstLong;
}
if (bytesIn != null && bytesIn.hasRemaining()) {
return bytesIn.getLong();
} else {
throw new IOException("Attempt to read past the edge of the index.");
}
}

private ByteBuffer fillBuffer() throws IOException {
DataOutputBuffer bytes = new DataOutputBuffer(BUFFER_CAPACITY);
// copy indexIn and close it if finished
IOUtils.copyBytes(is, bytes, 4*1024, true);
Contributor: as discussed, this copies the whole file until EOF.

return ByteBuffer.wrap(bytes.getData(), 0, bytes.getLength());
}

@Override
public int numBlocks() {
return numBlocks;
}

}
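
For reference, a minimal round-trip sketch of how this serde is driven (not part of the diff; the in-memory streams and sample offsets are illustrative stand-ins for the FSDataOutputStream/FSDataInputStream pair that LzoIndex uses below):

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

import com.hadoop.compression.lzo.LzoBasicIndexSerde;
import com.hadoop.compression.lzo.LzoIndexSerde;

public class LzoBasicIndexSerdeSketch {
  public static void main(String[] args) throws IOException {
    // Write side, mirroring createIndex() below: prepareToWrite(),
    // one writeOffset() per block, then finishWriting().
    ByteArrayOutputStream buf = new ByteArrayOutputStream();
    LzoIndexSerde writer = new LzoBasicIndexSerde();
    writer.prepareToWrite(new DataOutputStream(buf));
    for (long offset : new long[] { 42L, 300000L, 600000L }) {
      writer.writeOffset(offset);
    }
    writer.finishWriting();

    // Read side, mirroring readIndex() below: the caller reads the first
    // long itself and offers it to accepts(); a legacy index is a plain
    // sequence of non-negative offsets, so the basic serde claims it.
    DataInputStream in =
        new DataInputStream(new ByteArrayInputStream(buf.toByteArray()));
    long firstLong = in.readLong();
    LzoBasicIndexSerde reader = new LzoBasicIndexSerde();
    if (reader.accepts(firstLong)) {
      reader.prepareToRead(in); // buffers the remaining longs in memory
      while (reader.hasNext()) {
        System.out.println("block offset: " + reader.next());
      }
    }
  }
}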
58 changes: 39 additions & 19 deletions src/java/com/hadoop/compression/lzo/LzoIndex.java
@@ -20,7 +20,7 @@

import java.io.EOFException;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;

import org.apache.hadoop.conf.Configurable;
@@ -29,8 +29,6 @@
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

@@ -44,6 +42,13 @@ public class LzoIndex {

private long[] blockPositions_;

private static ArrayList<Class<? extends LzoIndexSerde>> serdeClasses =
new ArrayList<Class<? extends LzoIndexSerde>>();
static {
serdeClasses.add(LzoBasicIndexSerde.class);
serdeClasses.add(LzoTinyOffsetsSerde.class);
}

/**
* Create an empty index, typically indicating no index file exists.
*/
@@ -175,21 +180,21 @@ public static LzoIndex readIndex(FileSystem fs, Path lzoFile) throws IOException
// return empty index, fall back to the unsplittable mode
return new LzoIndex();
}

int capacity = 16 * 1024 * 8; //size for a 4GB file (with 256KB lzo blocks)
DataOutputBuffer bytes = new DataOutputBuffer(capacity);

// copy indexIn and close it
IOUtils.copyBytes(indexIn, bytes, 4*1024, true);

ByteBuffer bytesIn = ByteBuffer.wrap(bytes.getData(), 0, bytes.getLength());
int blocks = bytesIn.remaining()/8;
LzoIndex index = new LzoIndex(blocks);

for (int i = 0; i < blocks; i++) {
index.set(i, bytesIn.getLong());
long firstLong = indexIn.readLong();
LzoIndexSerde serde = null;
for (Class<? extends LzoIndexSerde> candidateClass : serdeClasses) {
LzoIndexSerde candidate = null;
candidate = quietGetInstance(candidateClass);
if (candidate.accepts(firstLong)) {
serde = candidate;
break;
}
}
serde.prepareToRead(indexIn);
LzoIndex index = new LzoIndex(serde.numBlocks());
for (int i = 0; i < serde.numBlocks(); i++) {
index.set(i, serde.next());
}

return index;
}

@@ -217,6 +222,8 @@ public static void createIndex(FileSystem fs, Path lzoFile)

FSDataInputStream is = null;
FSDataOutputStream os = null;
LzoIndexSerde writer = new LzoTinyOffsetsSerde();

Path outputFile = lzoFile.suffix(LZO_INDEX_SUFFIX);
Path tmpOutputFile = lzoFile.suffix(LZO_TMP_INDEX_SUFFIX);

@@ -226,6 +233,7 @@
try {
is = fs.open(lzoFile);
os = fs.create(tmpOutputFile);
writer.prepareToWrite(os);
LzopDecompressor decompressor = (LzopDecompressor) codec.createDecompressor();
// Solely for reading the header
codec.createInputStream(is, decompressor);
@@ -252,7 +260,7 @@ public static void createIndex(FileSystem fs, Path lzoFile)
numDecompressedChecksums : numDecompressedChecksums + numCompressedChecksums;
long pos = is.getPos();
// write the pos of the block start
os.writeLong(pos - 8);
writer.writeOffset(pos - 8);
// seek to the start of the next block, skip any checksums
is.seek(pos + compressedBlockSize + (4 * numChecksumsToSkip));
}
@@ -263,7 +271,7 @@ public static void createIndex(FileSystem fs, Path lzoFile)
if (is != null) {
is.close();
}

writer.finishWriting();
if (os != null) {
os.close();
}
@@ -277,5 +285,17 @@ public static void createIndex(FileSystem fs, Path lzoFile)
}
}
}

private static LzoIndexSerde quietGetInstance(Class<? extends LzoIndexSerde> klass) throws IOException {
LzoIndexSerde instance = null;
try {
instance = klass.newInstance();
} catch (InstantiationException e) {
throw new IOException(e);
} catch (IllegalAccessException e) {
throw new IOException(e);
}
return instance;
}
}

72 changes: 72 additions & 0 deletions src/java/com/hadoop/compression/lzo/LzoIndexSerde.java
@@ -0,0 +1,72 @@
/*
* This file is part of Hadoop-Gpl-Compression.
*
* Hadoop-Gpl-Compression is free software: you can redistribute it
* and/or modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* Hadoop-Gpl-Compression is distributed in the hope that it will be
* useful, but WITHOUT ANY WARRANTY; without even the implied warranty
* of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Hadoop-Gpl-Compression. If not, see
* <http://www.gnu.org/licenses/>.
*/

package com.hadoop.compression.lzo;

import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public interface LzoIndexSerde {

/**
* Serdes will be tried in order until one is found that accepts
* the offered format. A format is determined from the first 8
* bytes (represented as a long) written to the index file.
* <p>
* The first long is somewhat constrained: the topmost bit should be
* 1, the next 31 bits are a version number by which the appropriate
* SerDe is chosen, and the remaining 32 bits can hold arbitrary data
* (a header, the length of a header, an offset... up to you).
*
* @param firstLong the first 8 bytes of the index file, read as a long
* @return true if this format is recognized by the SerDe, false otherwise.
*/
public boolean accepts(long firstLong);

public void prepareToWrite(DataOutputStream os) throws IOException;

/**
* Prepare to read the index. Note that the first 8 bytes will already have
* been read from this stream and passed to accepts() in the form of a long.
* @param is DataInputStream to read the remainder of the index from.
*/
public void prepareToRead(DataInputStream is) throws IOException;

/**
* Write the next offset into the file. It is expected that
* the offsets are supplied in order. <code>prepareToWrite()</code>
* should be called before the first invocation of this method.
* @param offset byte offset of a compressed block start in the LZO file
*/
public void writeOffset(long offset) throws IOException;

public void finishWriting() throws IOException;

public boolean hasNext() throws IOException;

public long next() throws IOException;

/**
* Get the number of blocks expected to be read from this index.
* Will only be called after prepareToRead().
* @return number of block offsets that will be read back.
*/
public int numBlocks();
Contributor: This one is a bit problematic to implement and forces us to process the whole file at once.

Contributor Author: Fixed.

}
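
To make the first-long contract above concrete, here is a hypothetical sketch (not from this PR; the class name and VERSION value are made up) of how a versioned serde could pack and recognize its header long:

public class VersionedFirstLongSketch {
  private static final int VERSION = 2; // hypothetical version id

  // Pack a first long per the contract: bit 63 set to 1, bits 62..32
  // carry a 31-bit version number, bits 31..0 are free for header data.
  static long firstLongFor(int headerData) {
    return (1L << 63) | ((long) VERSION << 32) | (headerData & 0xFFFFFFFFL);
  }

  // What an accepts(long) implementation could check: legacy indexes
  // start with a plain non-negative offset, so a set top bit marks a
  // versioned format, and the next 31 bits select the serde.
  static boolean accepts(long firstLong) {
    if (firstLong >= 0) {
      return false; // top bit 0: legacy offset, handled by the basic serde
    }
    int version = (int) ((firstLong >>> 32) & 0x7FFFFFFF);
    return version == VERSION;
  }
}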