-
Notifications
You must be signed in to change notification settings - Fork 28.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-16711] YarnShuffleService doesn't re-init properly on YARN rolling upgrade #14718
Changes from 4 commits
f0a5c56
6db1ad6
8b4ba9d
2643d56
0e39687
c4f58e8
5319981
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,153 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.apache.spark.network.util; | ||
|
||
import com.fasterxml.jackson.annotation.JsonCreator; | ||
import com.fasterxml.jackson.annotation.JsonProperty; | ||
import com.fasterxml.jackson.databind.ObjectMapper; | ||
import com.google.common.base.Charsets; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Use |
||
import org.fusesource.leveldbjni.JniDBFactory; | ||
import org.fusesource.leveldbjni.internal.NativeDB; | ||
import org.iq80.leveldb.DB; | ||
import org.iq80.leveldb.Options; | ||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
import java.io.File; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: java should come before others. |
||
import java.io.IOException; | ||
|
||
/** | ||
* LevelDB utility class available in the network package. | ||
*/ | ||
public class LevelDBProvider { | ||
private static final Logger logger = LoggerFactory.getLogger(LevelDBProvider.class); | ||
|
||
public static DB initLevelDB(File dbFile, StoreVersion version, ObjectMapper mapper) throws | ||
IOException { | ||
DB tmpDb = null; | ||
if (dbFile != null) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. no need to change now, but in Java at least I tend to avoid the extra level of indentation.
|
||
Options options = new Options(); | ||
options.createIfMissing(false); | ||
options.logger(new LevelDBLogger()); | ||
try { | ||
tmpDb = JniDBFactory.factory.open(dbFile, options); | ||
} catch (NativeDB.DBException e) { | ||
if (e.isNotFound() || e.getMessage().contains(" does not exist ")) { | ||
logger.info("Creating state database at " + dbFile); | ||
options.createIfMissing(true); | ||
try { | ||
tmpDb = JniDBFactory.factory.open(dbFile, options); | ||
} catch (NativeDB.DBException dbExc) { | ||
throw new IOException("Unable to create state store", dbExc); | ||
} | ||
} else { | ||
// the leveldb file seems to be corrupt somehow. Lets just blow it away and create a new | ||
// one, so we can keep processing new apps | ||
logger.error("error opening leveldb file {}. Creating new file, will not be able to " + | ||
"recover state for existing applications", dbFile, e); | ||
if (dbFile.isDirectory()) { | ||
for (File f : dbFile.listFiles()) { | ||
if (!f.delete()) { | ||
logger.warn("error deleting {}", f.getPath()); | ||
} | ||
} | ||
} | ||
if (!dbFile.delete()) { | ||
logger.warn("error deleting {}", dbFile.getPath()); | ||
} | ||
options.createIfMissing(true); | ||
try { | ||
tmpDb = JniDBFactory.factory.open(dbFile, options); | ||
} catch (NativeDB.DBException dbExc) { | ||
throw new IOException("Unable to create state store", dbExc); | ||
} | ||
|
||
} | ||
} | ||
// if there is a version mismatch, we throw an exception, which means the service is unusable | ||
checkVersion(tmpDb, version, mapper); | ||
} | ||
return tmpDb; | ||
} | ||
|
||
private static class LevelDBLogger implements org.iq80.leveldb.Logger { | ||
private static final Logger LOG = LoggerFactory.getLogger(LevelDBLogger.class); | ||
|
||
@Override | ||
public void log(String message) { | ||
LOG.info(message); | ||
} | ||
} | ||
|
||
/** | ||
* Simple major.minor versioning scheme. Any incompatible changes should be across major | ||
* versions. Minor version differences are allowed -- meaning we should be able to read | ||
* dbs that are either earlier *or* later on the minor version. | ||
*/ | ||
public static void checkVersion(DB db, StoreVersion newversion, ObjectMapper mapper) throws | ||
IOException { | ||
byte[] bytes = db.get(StoreVersion.KEY); | ||
if (bytes == null) { | ||
storeVersion(db, newversion, mapper); | ||
} else { | ||
StoreVersion version = mapper.readValue(bytes, StoreVersion.class); | ||
if (version.major != newversion.major) { | ||
throw new IOException("cannot read state DB with version " + version + ", incompatible " + | ||
"with current version " + newversion); | ||
} | ||
storeVersion(db, newversion, mapper); | ||
} | ||
} | ||
|
||
public static void storeVersion(DB db, StoreVersion version, ObjectMapper mapper) | ||
throws IOException { | ||
db.put(StoreVersion.KEY, mapper.writeValueAsBytes(version)); | ||
} | ||
|
||
public static class StoreVersion { | ||
|
||
final static byte[] KEY = "StoreVersion".getBytes(Charsets.UTF_8); | ||
|
||
public final int major; | ||
public final int minor; | ||
|
||
@JsonCreator | ||
public StoreVersion(@JsonProperty("major") int major, @JsonProperty("minor") int minor) { | ||
this.major = major; | ||
this.minor = minor; | ||
} | ||
|
||
@Override | ||
public boolean equals(Object o) { | ||
if (this == o) return true; | ||
if (o == null || getClass() != o.getClass()) return false; | ||
|
||
StoreVersion that = (StoreVersion) o; | ||
|
||
return major == that.major && minor == that.minor; | ||
} | ||
|
||
@Override | ||
public int hashCode() { | ||
int result = major; | ||
result = 31 * result + minor; | ||
return result; | ||
} | ||
} | ||
|
||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@tgravescs Why not move LevelDBProvider and this dependency to
common/network-shuffle/
? It's not used incommon/network-common/
.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What do you mean? looking below its used in common/network-common/src/main/java/org/apache/spark/network/util/LevelDBProvider.java
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh maybe I misunderstood, why not you mean move it all there? I think the only reason was it was a utility class that could be used by others. Both network-shuffle and network-yarn use it now.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think network-yarn depends on network-shuffle, so the utility class can be put into network-shuffle.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
NVM. Just recall that @vanzin may want to use leveldb in the history server.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's sort of orthogonal, though. My code pulls leveldb directly, not transitively through the network libs.