From 5a8303ea1b49e67496d90a8fba59458dc47694e4 Mon Sep 17 00:00:00 2001
From: alexVengrovsk
Date: Sun, 16 Aug 2015 08:44:46 +0300
Subject: [PATCH 1/4] Polishing HadoopArchives.java

---
 .../main/java/org/apache/hadoop/tools/HadoopArchives.java | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/hadoop-tools/hadoop-archives/src/main/java/org/apache/hadoop/tools/HadoopArchives.java b/hadoop-tools/hadoop-archives/src/main/java/org/apache/hadoop/tools/HadoopArchives.java
index ee148503f10e2..e51f41e597609 100644
--- a/hadoop-tools/hadoop-archives/src/main/java/org/apache/hadoop/tools/HadoopArchives.java
+++ b/hadoop-tools/hadoop-archives/src/main/java/org/apache/hadoop/tools/HadoopArchives.java
@@ -233,8 +233,8 @@ static class HArchiveInputFormat implements InputFormat<LongWritable, HarEntry>
     //generate input splits from the src file lists
     public InputSplit[] getSplits(JobConf jconf, int numSplits)
     throws IOException {
-      String srcfilelist = jconf.get(SRC_LIST_LABEL, "");
-      if ("".equals(srcfilelist)) {
+      String srcFileList = jconf.get(SRC_LIST_LABEL, "");
+      if ("".equals(srcFileList)) {
         throw new IOException("Unable to get the " +
             "src file for archive generation.");
       }
@@ -243,7 +243,7 @@ public InputSplit[] getSplits(JobConf jconf, int numSplits)
         throw new IOException("Invalid size of files to archive");
       }
       //we should be safe since this is set by our own code
-      Path src = new Path(srcfilelist);
+      Path src = new Path(srcFileList);
       FileSystem fs = src.getFileSystem(jconf);
       FileStatus fstatus = fs.getFileStatus(src);
       ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);

From 2b07091301344699a2b01fe1970dfcb93e3c0659 Mon Sep 17 00:00:00 2001
From: Alex
Date: Sun, 16 Aug 2015 17:06:47 +0300
Subject: [PATCH 2/4] Improve the getSplits method of HadoopArchives.java

---
 .../src/main/java/org/apache/hadoop/tools/HadoopArchives.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hadoop-tools/hadoop-archives/src/main/java/org/apache/hadoop/tools/HadoopArchives.java b/hadoop-tools/hadoop-archives/src/main/java/org/apache/hadoop/tools/HadoopArchives.java
index e51f41e597609..65aa03dde1423 100644
--- a/hadoop-tools/hadoop-archives/src/main/java/org/apache/hadoop/tools/HadoopArchives.java
+++ b/hadoop-tools/hadoop-archives/src/main/java/org/apache/hadoop/tools/HadoopArchives.java
@@ -262,7 +262,7 @@ public InputSplit[] getSplits(JobConf jconf, int numSplits)
       // have equals sized data to read and write to. 
       try (SequenceFile.Reader reader = new SequenceFile.Reader(fs, src, jconf)) {
         while(reader.next(key, value)) {
-          if (currentCount + key.get() > targetSize && currentCount != 0){
+          if (currentCount != 0 && currentCount + key.get() > targetSize){
             long size = lastPos - startPos;
             splits.add(new FileSplit(src, startPos, size, (String[]) null));
             remaining = remaining - size;

From 499f4252a82a30bac4239a5efdcc56144e34a398 Mon Sep 17 00:00:00 2001
From: Alex
Date: Sun, 16 Aug 2015 21:53:53 +0300
Subject: [PATCH 3/4] HadoopArchives.java polishing

---
 .../apache/hadoop/tools/HadoopArchives.java | 58 ++++++++-----------
 1 file changed, 25 insertions(+), 33 deletions(-)

diff --git a/hadoop-tools/hadoop-archives/src/main/java/org/apache/hadoop/tools/HadoopArchives.java b/hadoop-tools/hadoop-archives/src/main/java/org/apache/hadoop/tools/HadoopArchives.java
index 65aa03dde1423..24ab8d5591ed1 100644
--- a/hadoop-tools/hadoop-archives/src/main/java/org/apache/hadoop/tools/HadoopArchives.java
+++ b/hadoop-tools/hadoop-archives/src/main/java/org/apache/hadoop/tools/HadoopArchives.java
@@ -76,7 +76,7 @@
 import com.google.common.base.Charsets;
 
 /**
- * a archive creation utility.
+ * This is an archive creation utility.
  * This class provides methods that can be used
  * to create hadoop archives. For understanding of
  * Hadoop archives look at {@link HarFileSystem}.
@@ -104,7 +104,7 @@ public class HadoopArchives implements Tool {
   static final String HAR_REPLICATION_LABEL = NAME + ".replication.factor";
   /** the size of the part files that will be created when archiving **/
   static final String HAR_PARTSIZE_LABEL = NAME + ".partfile.size";
-
+  static final String TEST_HADOOP_ARCHIVES_JAR_PATH = "test.hadoop.archives.jar";
   /** size of each part file size **/
   long partSize = 2 * 1024 * 1024 * 1024l;
   /** size of blocks in hadoop archives **/
@@ -116,10 +116,13 @@ public class HadoopArchives implements Tool {
   + " <-archiveName <NAME>.har> <-p <parent path>> [-r <replication factor>]"
   + " <src>* <dest>"
   + "\n";
-
-
+
   private JobConf conf;
 
+  public HadoopArchives(Configuration conf) {
+    setConf(conf);
+  }
+
   public void setConf(Configuration conf) {
     if (conf instanceof JobConf) {
       this.conf = (JobConf) conf;
@@ -130,7 +133,7 @@ public void setConf(Configuration conf) {
     // This is for test purposes since MR2, different from Streaming
     // here it is not possible to add a JAR to the classpath the tool
     // will when running the mapreduce job. 
-    String testJar = System.getProperty(TEST_HADOOP_ARCHIVES_JAR_PATH, null);
+    String testJar = System.getProperty(TEST_HADOOP_ARCHIVES_JAR_PATH);
     if (testJar != null) {
       this.conf.setJar(testJar);
     }
@@ -140,10 +143,6 @@ public Configuration getConf() {
     return this.conf;
   }
 
-  public HadoopArchives(Configuration conf) {
-    setConf(conf);
-  }
-
   // check the src paths
   private static void checkPaths(Configuration conf, List<Path> paths) throws
   IOException {
@@ -158,11 +157,11 @@ private static void checkPaths(Configuration conf, List<Path> paths) throws
   /**
    * this assumes that there are two types of files file/dir
    * @param fs the input filesystem
-   * @param fdir the filestatusdir of the path 
+   * @param fdir the filestatusdir of the path
    * @param out the list of paths output of recursive ls
    * @throws IOException
   */
-  private void recursivels(FileSystem fs, FileStatusDir fdir, List<FileStatusDir> out)
+  private void recursiveLs(FileSystem fs, FileStatusDir fdir, List<FileStatusDir> out)
   throws IOException {
     if (fdir.getFileStatus().isFile()) {
       out.add(fdir);
@@ -174,7 +173,7 @@ private void recursivels(FileSystem fs, FileStatusDir fdir, List<FileStatusDir>
       fdir.setChildren(listStatus);
       for (FileStatus stat: listStatus) {
         FileStatusDir fstatDir = new FileStatusDir(stat, null);
-        recursivels(fs, fstatDir, out);
+        recursiveLs(fs, fstatDir, out);
       }
     }
   }
@@ -246,7 +245,7 @@ public InputSplit[] getSplits(JobConf jconf, int numSplits)
       Path src = new Path(srcFileList);
       FileSystem fs = src.getFileSystem(jconf);
       FileStatus fstatus = fs.getFileStatus(src);
-      ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
+      ArrayList<FileSplit> splits = new ArrayList<>(numSplits);
       LongWritable key = new LongWritable();
       final HarEntry value = new HarEntry();
       // the remaining bytes in the file split
@@ -289,22 +288,18 @@ public RecordReader<LongWritable, HarEntry> getRecordReader(InputSplit split,
   }
 
   private boolean checkValidName(String name) {
-    Path tmp = new Path(name);
-    if (tmp.depth() != 1) {
-      return false;
+    if(name.endsWith(".har")) {
+      Path tmp = new Path(name);
+      if (tmp.depth() == 1) return true;
     }
-    if (name.endsWith(".har"))
-      return true;
     return false;
   }
 
   private Path largestDepth(List<Path> paths) {
     Path deepest = paths.get(0);
-    for (Path p: paths) {
-      if (p.depth() > deepest.depth()) {
-        deepest = p;
-      }
+    for (Path p : paths) {
+      if (p.depth() > deepest.depth()) deepest = p;
     }
     return deepest;
   }
 
@@ -356,7 +351,7 @@ else if (fullPath.depth() > root.depth()) {
   private void writeTopLevelDirs(SequenceFile.Writer srcWriter,
       List<Path> paths, Path parentPath) throws IOException {
     // extract paths from absolute URI's
-    List<Path> justPaths = new ArrayList<Path>();
+    List<Path> justPaths = new ArrayList<>();
     for (Path p: paths) {
       justPaths.add(new Path(p.toUri().getPath()));
     }
@@ -365,15 +360,14 @@ private void writeTopLevelDirs(SequenceFile.Writer srcWriter,
      * twice and also we need to only add valid child of a path that
      * are specified the user.
      */
-    TreeMap<String, HashSet<String>> allpaths = new TreeMap<String,
-        HashSet<String>>();
+    TreeMap<String, HashSet<String>> allpaths = new TreeMap<>();
     /* the largest depth of paths. 
        the max number of times
      * we need to iterate */
     Path deepest = largestDepth(paths);
     Path root = new Path(Path.SEPARATOR);
     for (int i = parentPath.depth(); i < deepest.depth(); i++) {
-      List<Path> parents = new ArrayList<Path>();
+      List<Path> parents = new ArrayList<>();
       for (Path p: justPaths) {
         if (p.compareTo(root) == 0){
           //do nothing
@@ -386,7 +380,7 @@ private void writeTopLevelDirs(SequenceFile.Writer srcWriter,
             children.add(p.getName());
           }
           else {
-            HashSet<String> children = new HashSet<String>();
+            HashSet<String> children = new HashSet<>();
             children.add(p.getName());
             allpaths.put(parent.toString(), children);
           }
@@ -523,10 +517,10 @@ void archive(Path parentPath, List<Path> srcPaths,
       // and then write them to the input file
       // one at a time
       for (Path src: srcPaths) {
-        ArrayList<FileStatusDir> allFiles = new ArrayList<FileStatusDir>();
         FileStatus fstatus = fs.getFileStatus(src);
         FileStatusDir fdir = new FileStatusDir(fstatus, null);
-        recursivels(fs, fdir, allFiles);
+        List<FileStatusDir> allFiles = new ArrayList<>();
+        recursiveLs(fs, fdir, allFiles);
         for (FileStatusDir statDir: allFiles) {
           FileStatus stat = statDir.getFileStatus();
           long len = stat.isDirectory()? 0:stat.getLen();
@@ -869,7 +863,7 @@ public int run(String[] args) throws Exception {
       }
       // Remaining args
      args = commandLine.getArgs();
-      List<Path> srcPaths = new ArrayList<Path>();
+      List<Path> srcPaths = new ArrayList<>();
       Path destPath = null;
       //read the rest of the paths
       for (int i = 0; i < args.length; i++) {
@@ -899,7 +893,7 @@ public int run(String[] args) throws Exception {
         srcPaths.add(parentPath);
       }
       // do a glob on the srcPaths and then pass it on
-      List<Path> globPaths = new ArrayList<Path>();
+      List<Path> globPaths = new ArrayList<>();
       for (Path p: srcPaths) {
         FileSystem fs = p.getFileSystem(getConf());
         FileStatus[] statuses = fs.globStatus(p);
@@ -923,8 +917,6 @@ public int run(String[] args) throws Exception {
     return 0;
   }
 
-  static final String TEST_HADOOP_ARCHIVES_JAR_PATH = "test.hadoop.archives.jar";
-
   /** the main functions **/
   public static void main(String[] args) {
     JobConf job = new JobConf(HadoopArchives.class);

From 00f6113052508e33fa99cf1cfd54070acf388fd5 Mon Sep 17 00:00:00 2001
From: Alex
Date: Sun, 16 Aug 2015 23:12:22 +0300
Subject: [PATCH 4/4] HadoopArchives.java polishing, mainly in the writeTopLevelDirs method

---
 .../apache/hadoop/tools/HadoopArchives.java | 45 +++++++------------
 1 file changed, 17 insertions(+), 28 deletions(-)

diff --git a/hadoop-tools/hadoop-archives/src/main/java/org/apache/hadoop/tools/HadoopArchives.java b/hadoop-tools/hadoop-archives/src/main/java/org/apache/hadoop/tools/HadoopArchives.java
index 24ab8d5591ed1..cd152a1886d37 100644
--- a/hadoop-tools/hadoop-archives/src/main/java/org/apache/hadoop/tools/HadoopArchives.java
+++ b/hadoop-tools/hadoop-archives/src/main/java/org/apache/hadoop/tools/HadoopArchives.java
@@ -84,7 +84,6 @@ public class HadoopArchives implements Tool {
 
   public static final int VERSION = 3;
   private static final Log LOG = LogFactory.getLog(HadoopArchives.class);
-
   private static final String NAME = "har";
   private static final String ARCHIVE_NAME = "archiveName";
   private static final String REPLICATION = "r";
@@ -111,12 +110,10 @@ public class HadoopArchives implements Tool {
   long blockSize = 512 * 1024 * 1024l;
   /** the desired replication degree; default is 3 **/
   short repl = 3;
-
   private static final String usage = "archive"
   + " <-archiveName <NAME>.har> <-p <parent path>> [-r <replication factor>]"
   + " <src>* <dest>"
   + "\n";
-
   private JobConf conf;
 
   public HadoopArchives(Configuration conf) {
@@ -171,7 +168,7 @@ private void recursiveLs(FileSystem fs, FileStatusDir fdir, List<FileStatusDir>
       out.add(fdir);
       FileStatus[] listStatus = 
           fs.listStatus(fdir.getFileStatus().getPath());
       fdir.setChildren(listStatus);
-      for (FileStatus stat: listStatus) {
+      for (FileStatus stat : listStatus) {
         FileStatusDir fstatDir = new FileStatusDir(stat, null);
         recursiveLs(fs, fstatDir, out);
       }
@@ -288,7 +285,7 @@ public RecordReader<LongWritable, HarEntry> getRecordReader(InputSplit split,
   }
 
   private boolean checkValidName(String name) {
-    if(name.endsWith(".har")) {
+    if (name.endsWith(".har")) {
       Path tmp = new Path(name);
       if (tmp.depth() == 1) return true;
     }
@@ -315,13 +312,15 @@ private Path relPathToRoot(Path fullPath, Path root) {
     // rather than just using substring
     // so that we do not break sometime later
     final Path justRoot = new Path(Path.SEPARATOR);
-    if (fullPath.depth() == root.depth()) {
+    final int fullPathDepth = fullPath.depth();
+    final int rootDepth = root.depth();
+    if (fullPathDepth == rootDepth) {
       return justRoot;
     }
-    else if (fullPath.depth() > root.depth()) {
+    else if (fullPathDepth > rootDepth) {
       Path retPath = new Path(fullPath.getName());
       Path parent = fullPath.getParent();
-      for (int i=0; i < (fullPath.depth() - root.depth() -1); i++) {
+      for (int i=0; i < (fullPathDepth - rootDepth -1); i++) {
         retPath = new Path(parent.getName(), retPath);
         parent = parent.getParent();
       }
@@ -360,7 +359,7 @@ private void writeTopLevelDirs(SequenceFile.Writer srcWriter,
      * twice and also we need to only add valid child of a path that
      * are specified the user.
      */
-    TreeMap<String, HashSet<String>> allpaths = new TreeMap<>();
+    TreeMap<String, HashSet<String>> allPaths = new TreeMap<>();
     /* the largest depth of paths. 
        the max number of times
      * we need to iterate */
@@ -368,29 +367,19 @@ private void writeTopLevelDirs(SequenceFile.Writer srcWriter,
     Path root = new Path(Path.SEPARATOR);
     for (int i = parentPath.depth(); i < deepest.depth(); i++) {
       List<Path> parents = new ArrayList<>();
-      for (Path p: justPaths) {
-        if (p.compareTo(root) == 0){
-          //do nothing
-        }
-        else {
-          Path parent = p.getParent();
-          if (null != parent) {
-            if (allpaths.containsKey(parent.toString())) {
-              HashSet<String> children = allpaths.get(parent.toString());
-              children.add(p.getName());
-            }
-            else {
-              HashSet<String> children = new HashSet<>();
-              children.add(p.getName());
-              allpaths.put(parent.toString(), children);
-            }
-            parents.add(parent);
-          }
+      for (Path p : justPaths) {
+        Path parent = p.getParent();
+        if (p.compareTo(root) != 0 && parent != null) {
+          final String parentAsString = parent.toString();
+          HashSet<String> children = new HashSet<>();
+          children.add(p.getName());
+          allPaths.put(parentAsString, children);
+          parents.add(parent);
        }
      }
      justPaths = parents;
    }
-    Set<Map.Entry<String, HashSet<String>>> keyVals = allpaths.entrySet();
+    Set<Map.Entry<String, HashSet<String>>> keyVals = allPaths.entrySet();
     for (Map.Entry<String, HashSet<String>> entry : keyVals) {
       final Path relPath = relPathToRoot(new Path(entry.getKey()), parentPath);
       if (relPath != null) {
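
Background for the loop that the last hunk rewrites: writeTopLevelDirs records, for every parent directory, the set of child names that the archive must expose as top-level entries. The sketch below is not part of the patch series; it is a minimal, self-contained illustration of that parent-to-children grouping. The class name and sample paths are invented for illustration, and the only Hadoop dependency assumed is org.apache.hadoop.fs.Path from hadoop-common. It uses a get-or-create pattern so that input paths sharing the same parent accumulate into one set.

  import java.util.Arrays;
  import java.util.HashSet;
  import java.util.List;
  import java.util.Map;
  import java.util.TreeMap;

  import org.apache.hadoop.fs.Path;

  /** Illustrative sketch only: group child names under their parent paths. */
  public class TopLevelDirsSketch {
    public static void main(String[] args) {
      // Sample inputs, not taken from the patch.
      List<Path> justPaths = Arrays.asList(
          new Path("/user/alice/data"),
          new Path("/user/alice/logs"),
          new Path("/user/bob"));

      TreeMap<String, HashSet<String>> allPaths = new TreeMap<>();
      for (Path p : justPaths) {
        Path parent = p.getParent();
        if (parent != null) {
          // Get-or-create the child set so paths that share a parent accumulate.
          HashSet<String> children = allPaths.get(parent.toString());
          if (children == null) {
            children = new HashSet<>();
            allPaths.put(parent.toString(), children);
          }
          children.add(p.getName());
        }
      }

      // Expected grouping: /user -> {bob}, /user/alice -> {data, logs}
      // (iteration order of each HashSet may vary).
      for (Map.Entry<String, HashSet<String>> e : allPaths.entrySet()) {
        System.out.println(e.getKey() + " -> " + e.getValue());
      }
    }
  }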