
Commit 42bd4f6

Merge pull request #1695 from himanshug/allow_hadoop_based_input_row_parser

Allow writing InputRowParser extensions that use Hadoop (or any other) libraries
drcrallen committed Sep 16, 2015
2 parents b464da4 + e8b9ee8 commit 42bd4f6
Showing 29 changed files with 754 additions and 254 deletions.
@@ -103,7 +103,8 @@ public void configure(Binder binder)
binder, Key.get(DruidNode.class, Self.class), new DruidNode("hadoop-indexer", null, null)
);
}
}
},
new IndexingHadoopModule()
)
);
jsonMapper = injector.getInstance(ObjectMapper.class);
@@ -104,11 +104,9 @@ public final static InputRow parseInputRow(Object value, InputRowParser parser)
{
if (parser instanceof StringInputRowParser && value instanceof Text) {
//Note: This is to ensure backward compatibility with 0.7.0 and before
//HadoopyStringInputRowParser can handle this and this special case is not needed
//except for backward compatibility
return ((StringInputRowParser) parser).parse(value.toString());
} else if (parser instanceof StringInputRowParser && value instanceof BytesWritable) {
BytesWritable valueBytes = (BytesWritable) value;
ByteBuffer valueBuffer = ByteBuffer.wrap(valueBytes.getBytes(), 0, valueBytes.getLength());
return ((StringInputRowParser) parser).parse(valueBuffer);
} else if (value instanceof InputRow) {
return (InputRow) value;
} else {
HadoopyStringInputRowParser.java (new file)
@@ -0,0 +1,69 @@
/*
* Licensed to Metamarkets Group Inc. (Metamarkets) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Metamarkets licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package io.druid.indexer;

import com.fasterxml.jackson.annotation.JsonProperty;
import com.metamx.common.IAE;
import io.druid.data.input.InputRow;
import io.druid.data.input.impl.InputRowParser;
import io.druid.data.input.impl.ParseSpec;
import io.druid.data.input.impl.StringInputRowParser;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;

import java.nio.ByteBuffer;

/**
*/
public class HadoopyStringInputRowParser implements InputRowParser<Object>
{
private final StringInputRowParser parser;

public HadoopyStringInputRowParser(@JsonProperty("parseSpec") ParseSpec parseSpec)
{
this.parser = new StringInputRowParser(parseSpec);
}

@Override
public InputRow parse(Object input)
{
if (input instanceof Text) {
return parser.parse(((Text) input).toString());
} else if (input instanceof BytesWritable) {
BytesWritable valueBytes = (BytesWritable) input;
return parser.parse(ByteBuffer.wrap(valueBytes.getBytes(), 0, valueBytes.getLength()));
} else {
throw new IAE("can't convert type [%s] to InputRow", input.getClass().getName());
}
}

@JsonProperty
@Override
public ParseSpec getParseSpec()
{
return parser.getParseSpec();
}

@Override
public InputRowParser withParseSpec(ParseSpec parseSpec)
{
return new HadoopyStringInputRowParser(parseSpec);
}
}
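
The parser above accepts either a Hadoop Text or a BytesWritable and delegates both to the wrapped StringInputRowParser, so the two input forms should yield equivalent rows. A minimal usage sketch; the CSVParseSpec, TimestampSpec, and DimensionsSpec constructor calls are copied from the test changes later in this commit, while the example class itself is hypothetical:

import com.google.common.collect.ImmutableList;
import io.druid.data.input.InputRow;
import io.druid.data.input.impl.CSVParseSpec;
import io.druid.data.input.impl.DimensionsSpec;
import io.druid.data.input.impl.TimestampSpec;
import io.druid.indexer.HadoopyStringInputRowParser;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;

import java.nio.charset.StandardCharsets;

public class HadoopyStringParserExample
{
  public static void main(String[] args)
  {
    HadoopyStringInputRowParser parser = new HadoopyStringInputRowParser(
        new CSVParseSpec(
            new TimestampSpec("timestamp", "yyyyMMddHH", null),
            new DimensionsSpec(ImmutableList.of("host"), null, null),
            null,
            ImmutableList.of("timestamp", "host", "visited_num")
        )
    );

    // Text takes the toString() branch of parse()...
    InputRow fromText = parser.parse(new Text("2014010112,example.com,5"));

    // ...and BytesWritable takes the ByteBuffer branch; both should agree.
    byte[] bytes = "2014010112,example.com,5".getBytes(StandardCharsets.UTF_8);
    InputRow fromBytes = parser.parse(new BytesWritable(bytes));

    System.out.println(fromText.getTimestamp() + " " + fromText.getDimension("host"));
    System.out.println(fromBytes.getTimestamp() + " " + fromBytes.getDimension("host"));
  }
}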
IndexingHadoopModule.java (new file)
@@ -0,0 +1,50 @@
/*
* Licensed to Metamarkets Group Inc. (Metamarkets) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Metamarkets licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package io.druid.indexer;

import com.fasterxml.jackson.databind.Module;
import com.fasterxml.jackson.databind.jsontype.NamedType;
import com.fasterxml.jackson.databind.module.SimpleModule;
import com.google.inject.Binder;
import io.druid.initialization.DruidModule;

import java.util.Arrays;
import java.util.List;

/**
*/
public class IndexingHadoopModule implements DruidModule
{
@Override
public List<? extends Module> getJacksonModules()
{
return Arrays.<Module>asList(
new SimpleModule("IndexingHadoopModule")
.registerSubtypes(
new NamedType(HadoopyStringInputRowParser.class, "hadoopyString")
)
);
}

@Override
public void configure(Binder binder)
{
}
}
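
IndexingHadoopModule's only job is to register the hadoopyString type name with Jackson, which is what lets an ingestion spec name the new parser polymorphically. Below is a sketch of how that resolution would look outside a full Druid process, where the injector normally registers the module. The format, timestampSpec, dimensionsSpec, and columns field names reflect my reading of the parse-spec JSON of this era and are an assumption; they do not appear in this diff:

import com.fasterxml.jackson.databind.Module;
import com.fasterxml.jackson.databind.ObjectMapper;
import io.druid.data.input.impl.InputRowParser;
import io.druid.indexer.IndexingHadoopModule;
import io.druid.jackson.DefaultObjectMapper;

public class HadoopyStringRegistrationExample
{
  public static void main(String[] args) throws Exception
  {
    ObjectMapper mapper = new DefaultObjectMapper();

    // Normally the Druid injector walks every DruidModule and does this.
    for (Module m : new IndexingHadoopModule().getJacksonModules()) {
      mapper.registerModule(m);
    }

    // "hadoopyString" now resolves to HadoopyStringInputRowParser.
    String json = "{\n"
                  + "  \"type\": \"hadoopyString\",\n"
                  + "  \"parseSpec\": {\n"
                  + "    \"format\": \"csv\",\n"
                  + "    \"timestampSpec\": {\"column\": \"timestamp\", \"format\": \"yyyyMMddHH\"},\n"
                  + "    \"dimensionsSpec\": {\"dimensions\": [\"host\"]},\n"
                  + "    \"columns\": [\"timestamp\", \"host\", \"visited_num\"]\n"
                  + "  }\n"
                  + "}";

    InputRowParser parser = mapper.readValue(json, InputRowParser.class);
    System.out.println(parser.getClass().getName());
  }
}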
@@ -340,21 +340,25 @@ private HadoopDruidIndexerConfig makeHadoopDruidIndexerConfig(Map<String, Object
new HadoopIngestionSpec(
new DataSchema(
"website",
new StringInputRowParser(
new CSVParseSpec(
new TimestampSpec("timestamp", "yyyyMMddHH", null),
new DimensionsSpec(ImmutableList.of("host"), null, null),
null,
ImmutableList.of("timestamp", "host", "host2", "visited_num")
)
MAPPER.convertValue(
new StringInputRowParser(
new CSVParseSpec(
new TimestampSpec("timestamp", "yyyyMMddHH", null),
new DimensionsSpec(ImmutableList.of("host"), null, null),
null,
ImmutableList.of("timestamp", "host", "host2", "visited_num")
)
),
Map.class
),
new AggregatorFactory[]{
new LongSumAggregatorFactory("visited_sum", "visited_num"),
new HyperUniquesAggregatorFactory("unique_hosts", "host2")
},
new UniformGranularitySpec(
Granularity.DAY, QueryGranularity.NONE, ImmutableList.of(INTERVAL_FULL)
)
),
MAPPER
),
new HadoopIOConfig(
inputSpec,
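Here and in the tests below, the parser is no longer handed to DataSchema directly: it is first flattened to a Map with MAPPER.convertValue(...), and the ObjectMapper itself becomes an extra DataSchema constructor argument. The apparent intent is that a spec can carry a Hadoop-dependent parser in inert Map form and only materialize it where it is actually used, so processes without the Hadoop classes on their classpath can still handle the spec. A self-contained sketch of that round-trip (the example class is hypothetical; the convertValue calls mirror the test code):

import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.ImmutableList;
import io.druid.data.input.impl.CSVParseSpec;
import io.druid.data.input.impl.DimensionsSpec;
import io.druid.data.input.impl.InputRowParser;
import io.druid.data.input.impl.StringInputRowParser;
import io.druid.data.input.impl.TimestampSpec;
import io.druid.jackson.DefaultObjectMapper;

import java.util.Map;

public class ParserMapRoundTripExample
{
  public static void main(String[] args)
  {
    ObjectMapper mapper = new DefaultObjectMapper();

    StringInputRowParser original = new StringInputRowParser(
        new CSVParseSpec(
            new TimestampSpec("timestamp", "yyyyMMddHH", null),
            new DimensionsSpec(ImmutableList.of("host"), null, null),
            null,
            ImmutableList.of("timestamp", "host", "visited_num")
        )
    );

    // The inert form the tests now pass to DataSchema; the "type" key is
    // preserved, so no parser class needs to be loadable just to hold this Map.
    Map<String, Object> parserMap = mapper.convertValue(original, Map.class);

    // The form recovered on demand wherever the parser is actually used.
    InputRowParser parser = mapper.convertValue(parserMap, InputRowParser.class);

    System.out.println(parser.getParseSpec().getTimestampSpec().getTimestampColumn());
  }
}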
DetermineHashedPartitionsJobTest.java
@@ -108,26 +108,36 @@ public DetermineHashedPartitionsJobTest(String dataFilePath, long targetPartitio
HadoopIngestionSpec ingestionSpec = new HadoopIngestionSpec(
new DataSchema(
"test_schema",
new StringInputRowParser(
new DelimitedParseSpec(
new TimestampSpec("ts", null, null),
new DimensionsSpec(ImmutableList.of("market", "quality", "placement", "placementish"), null, null),
"\t",
null,
Arrays.asList("ts",
"market",
"quality",
"placement",
"placementish",
"index")
)
HadoopDruidIndexerConfig.jsonMapper.convertValue(
new StringInputRowParser(
new DelimitedParseSpec(
new TimestampSpec("ts", null, null),
new DimensionsSpec(
ImmutableList.of("market", "quality", "placement", "placementish"),
null,
null
),
"\t",
null,
Arrays.asList(
"ts",
"market",
"quality",
"placement",
"placementish",
"index"
)
)
),
Map.class
),
new AggregatorFactory[]{new DoubleSumAggregatorFactory("index", "index")},
new UniformGranularitySpec(
Granularity.DAY,
QueryGranularity.NONE,
ImmutableList.of(new Interval(interval))
)
),
HadoopDruidIndexerConfig.jsonMapper
),
new HadoopIOConfig(
ImmutableMap.<String, Object>of(
DeterminePartitionsJobTest.java
@@ -223,18 +223,22 @@ public DeterminePartitionsJobTest(
new HadoopIngestionSpec(
new DataSchema(
"website",
new StringInputRowParser(
new CSVParseSpec(
new TimestampSpec("timestamp", "yyyyMMddHH", null),
new DimensionsSpec(ImmutableList.of("host", "country"), null, null),
null,
ImmutableList.of("timestamp", "host", "country", "visited_num")
)
HadoopDruidIndexerConfig.jsonMapper.convertValue(
new StringInputRowParser(
new CSVParseSpec(
new TimestampSpec("timestamp", "yyyyMMddHH", null),
new DimensionsSpec(ImmutableList.of("host", "country"), null, null),
null,
ImmutableList.of("timestamp", "host", "country", "visited_num")
)
),
Map.class
),
new AggregatorFactory[]{new LongSumAggregatorFactory("visited_num", "visited_num")},
new UniformGranularitySpec(
Granularity.DAY, QueryGranularity.NONE, ImmutableList.of(new Interval(interval))
)
),
HadoopDruidIndexerConfig.jsonMapper
),
new HadoopIOConfig(
ImmutableMap.<String, Object>of(
HadoopDruidIndexerConfigTest.java
@@ -17,6 +17,7 @@

package io.druid.indexer;

import com.fasterxml.jackson.databind.InjectableValues;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableList;
@@ -45,7 +46,11 @@
*/
public class HadoopDruidIndexerConfigTest
{
private static final ObjectMapper jsonMapper = new DefaultObjectMapper();
private static final ObjectMapper jsonMapper;
static {
jsonMapper = new DefaultObjectMapper();
jsonMapper.setInjectableValues(new InjectableValues.Std().addValue(ObjectMapper.class, jsonMapper));
}

public static <T> T jsonReadWriteRead(String s, Class<T> klass)
{
@@ -175,12 +180,17 @@ public void testHashedBucketSelection()

HadoopIngestionSpec spec = new HadoopIngestionSpec(
new DataSchema(
"foo", null, new AggregatorFactory[0], new UniformGranularitySpec(
Granularity.MINUTE,
QueryGranularity.MINUTE,
ImmutableList.of(new Interval("2010-01-01/P1D"))
)
), new HadoopIOConfig(ImmutableMap.<String, Object>of("paths", "bar", "type", "static"), null, null),
"foo",
null,
new AggregatorFactory[0],
new UniformGranularitySpec(
Granularity.MINUTE,
QueryGranularity.MINUTE,
ImmutableList.of(new Interval("2010-01-01/P1D"))
),
jsonMapper
),
new HadoopIOConfig(ImmutableMap.<String, Object>of("paths", "bar", "type", "static"), null, null),
new HadoopTuningConfig(
null,
null,
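This test and the two below all gain the same setup: the mapper registers itself in its own InjectableValues, so that deserialization can satisfy a @JacksonInject ObjectMapper constructor parameter, presumably the new ObjectMapper argument that DataSchema takes elsewhere in this commit (an assumption; DataSchema's own diff is not part of this page). A self-contained illustration of the Jackson mechanism, with all names hypothetical:

import com.fasterxml.jackson.annotation.JacksonInject;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.InjectableValues;
import com.fasterxml.jackson.databind.ObjectMapper;

public class InjectableValuesExample
{
  static class Holder
  {
    final String name;
    final ObjectMapper mapper;

    @JsonCreator
    Holder(
        @JsonProperty("name") String name,
        @JacksonInject ObjectMapper mapper
    )
    {
      this.name = name;
      this.mapper = mapper;
    }
  }

  public static void main(String[] args) throws Exception
  {
    ObjectMapper mapper = new ObjectMapper();
    mapper.setInjectableValues(
        new InjectableValues.Std().addValue(ObjectMapper.class, mapper)
    );

    // "mapper" is not in the JSON; Jackson fills it from InjectableValues,
    // keyed here by the declared type ObjectMapper.
    Holder h = mapper.readValue("{\"name\": \"x\"}", Holder.class);
    System.out.println(h.name + " " + (h.mapper == mapper)); // prints: x true
  }
}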
HadoopIngestionSpecTest.java
@@ -17,23 +17,28 @@

package io.druid.indexer;

import com.fasterxml.jackson.databind.InjectableValues;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Throwables;
import com.google.common.collect.Lists;
import io.druid.indexer.partitions.HashedPartitionsSpec;
import io.druid.metadata.MetadataStorageConnectorConfig;
import io.druid.indexer.partitions.PartitionsSpec;
import io.druid.indexer.partitions.SingleDimensionPartitionsSpec;
import io.druid.indexer.updater.MetadataStorageUpdaterJobSpec;
import io.druid.jackson.DefaultObjectMapper;
import io.druid.metadata.MetadataStorageConnectorConfig;
import io.druid.segment.indexing.granularity.UniformGranularitySpec;
import org.joda.time.Interval;
import org.junit.Assert;
import org.junit.Test;

public class HadoopIngestionSpecTest
{
private static final ObjectMapper jsonMapper = new DefaultObjectMapper();
private static final ObjectMapper jsonMapper;
static {
jsonMapper = new DefaultObjectMapper();
jsonMapper.setInjectableValues(new InjectableValues.Std().addValue(ObjectMapper.class, jsonMapper));
}

@Test
public void testGranularitySpec()
HadoopIngestionSpecUpdateDatasourcePathSpecSegmentsTest.java
@@ -19,6 +19,9 @@

package io.druid.indexer;

import com.fasterxml.jackson.databind.BeanProperty;
import com.fasterxml.jackson.databind.DeserializationContext;
import com.fasterxml.jackson.databind.InjectableValues;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
@@ -50,7 +53,15 @@ public class HadoopIngestionSpecUpdateDatasourcePathSpecSegmentsTest
private final String testDatasource = "test";
private final Interval testDatasourceInterval = new Interval("1970/3000");
private final Interval testDatasourceIntervalPartial = new Interval("2050/3000");
private final ObjectMapper jsonMapper = new DefaultObjectMapper();
private final ObjectMapper jsonMapper;

public HadoopIngestionSpecUpdateDatasourcePathSpecSegmentsTest()
{
jsonMapper = new DefaultObjectMapper();
jsonMapper.setInjectableValues(
new InjectableValues.Std().addValue(ObjectMapper.class, jsonMapper)
);
}

private static final DataSegment SEGMENT = new DataSegment(
"test1",
@@ -155,7 +166,8 @@ private HadoopDruidIndexerConfig testRunUpdateSegmentListIfDatasourcePathSpecIsU
ImmutableList.of(
new Interval("2010-01-01/P1D")
)
)
),
jsonMapper
),
new HadoopIOConfig(
jsonMapper.convertValue(datasourcePathSpec, Map.class),