Merge remote-tracking branch 'origin/master' into pin-pages
JoshRosen committed Jan 11, 2016
2 parents 76cfebd + b313bad commit 7265784
Showing 138 changed files with 1,363 additions and 2,341 deletions.
1 change: 0 additions & 1 deletion .gitignore
@@ -60,7 +60,6 @@ dev/create-release/*final
spark-*-bin-*.tgz
unit-tests.log
/lib/
ec2/lib/
rat-results.txt
scalastyle.txt
scalastyle-output.xml
1 change: 1 addition & 0 deletions R/pkg/NAMESPACE
@@ -130,6 +130,7 @@ exportMethods("%in%",
"count",
"countDistinct",
"crc32",
"hash",
"cume_dist",
"date_add",
"date_format",
20 changes: 20 additions & 0 deletions R/pkg/R/functions.R
@@ -340,6 +340,26 @@ setMethod("crc32",
column(jc)
})

#' hash
#'
#' Calculates the hash code of the given columns, and returns the result as an int column.
#'
#' @rdname hash
#' @name hash
#' @family misc_funcs
#' @export
#' @examples \dontrun{hash(df$c)}
setMethod("hash",
signature(x = "Column"),
function(x, ...) {
jcols <- lapply(list(x, ...), function (x) {
stopifnot(class(x) == "Column")
x@jc
})
jc <- callJStatic("org.apache.spark.sql.functions", "hash", jcols)
column(jc)
})

#' dayofmonth
#'
#' Extracts the day of the month as an integer from a given date/timestamp/string.
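The new SparkR `hash` wrapper collects the Java column references and forwards them to the JVM via `callJStatic`, so the actual hashing is done by `org.apache.spark.sql.functions.hash`. A minimal Scala sketch of that underlying call follows; the SparkContext `sc`, the toy DataFrame, and the column names are assumptions for illustration.

```scala
// Sketch only: assumes a running SparkContext `sc`; the DataFrame and column
// names are made up for illustration. This is the JVM-side function the new
// SparkR hash() delegates to through callJStatic.
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.functions.hash

val sqlContext = new SQLContext(sc)
import sqlContext.implicits._

val df = Seq(("spark", 1), ("hash", 2)).toDF("a", "b")
// hash(...) accepts one or more columns and yields an int column,
// mirroring hash(df$a, df$b) on the R side.
df.select(hash(df("a"), df("b")).as("hash_value")).show()
```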
4 changes: 4 additions & 0 deletions R/pkg/R/generics.R
@@ -736,6 +736,10 @@ setGeneric("countDistinct", function(x, ...) { standardGeneric("countDistinct")
#' @export
setGeneric("crc32", function(x) { standardGeneric("crc32") })

#' @rdname hash
#' @export
setGeneric("hash", function(x, ...) { standardGeneric("hash") })

#' @rdname cume_dist
#' @export
setGeneric("cume_dist", function(x) { standardGeneric("cume_dist") })
2 changes: 1 addition & 1 deletion R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -922,7 +922,7 @@ test_that("column functions", {
c <- column("a")
c1 <- abs(c) + acos(c) + approxCountDistinct(c) + ascii(c) + asin(c) + atan(c)
c2 <- avg(c) + base64(c) + bin(c) + bitwiseNOT(c) + cbrt(c) + ceil(c) + cos(c)
c3 <- cosh(c) + count(c) + crc32(c) + exp(c)
c3 <- cosh(c) + count(c) + crc32(c) + hash(c) + exp(c)
c4 <- explode(c) + expm1(c) + factorial(c) + first(c) + floor(c) + hex(c)
c5 <- hour(c) + initcap(c) + last(c) + last_day(c) + length(c)
c6 <- log(c) + (c) + log1p(c) + log2(c) + lower(c) + ltrim(c) + max(c) + md5(c)
core/src/main/scala/org/apache/spark/rdd/AsyncRDDActions.scala
@@ -77,7 +77,7 @@ class AsyncRDDActions[T: ClassTag](self: RDD[T]) extends Serializable with Loggi
This implementation is non-blocking, asynchronously handling the
results of each job and triggering the next job using callbacks on futures.
*/
def continue(partsScanned: Long)(implicit jobSubmitter: JobSubmitter) : Future[Seq[T]] =
def continue(partsScanned: Int)(implicit jobSubmitter: JobSubmitter): Future[Seq[T]] =
if (results.size >= num || partsScanned >= totalParts) {
Future.successful(results.toSeq)
} else {
@@ -99,7 +99,7 @@ class AsyncRDDActions[T: ClassTag](self: RDD[T]) extends Serializable with Loggi
}

val left = num - results.size
val p = partsScanned.toInt until math.min(partsScanned + numPartsToTry, totalParts).toInt
val p = partsScanned.until(math.min(partsScanned + numPartsToTry, totalParts).toInt)

val buf = new Array[Array[T]](p.size)
self.context.setCallSite(callSite)
@@ -109,13 +109,13 @@ class AsyncRDDActions[T: ClassTag](self: RDD[T]) extends Serializable with Loggi
p,
(index: Int, data: Array[T]) => buf(index) = data,
Unit)
job.flatMap {_ =>
job.flatMap { _ =>
buf.foreach(results ++= _.take(num - results.size))
continue(partsScanned + p.size)
}
}

new ComplexFutureAction[Seq[T]](continue(0L)(_))
new ComplexFutureAction[Seq[T]](continue(0)(_))
}

/**
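Note the shape of the fix: `partsScanned` goes back to `Int`, but the sum `partsScanned + numPartsToTry` is still evaluated as a `Long` (`numPartsToTry` is a `Long` here) and clamped to `totalParts` before the narrowing `toInt`. A self-contained sketch of that overflow-safe range computation, with illustrative values standing in for the real driver state:

```scala
// Standalone sketch of the range computation used above; the concrete numbers
// are illustrative (2 partitions, a take() request close to Int.MaxValue).
val totalParts: Int = 2
val partsScanned: Int = 0
val numPartsToTry: Long = 2147483638L

// The addition happens in Long, is clamped to totalParts, and only then
// narrowed to Int, so the range end cannot overflow or go negative.
val p = partsScanned.until(math.min(partsScanned + numPartsToTry, totalParts).toInt)
assert(p == (0 until 2))  // partitions 0 and 1
```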
4 changes: 2 additions & 2 deletions core/src/main/scala/org/apache/spark/rdd/RDD.scala
@@ -1190,7 +1190,7 @@ abstract class RDD[T: ClassTag](
} else {
val buf = new ArrayBuffer[T]
val totalParts = this.partitions.length
var partsScanned = 0L
var partsScanned = 0
while (buf.size < num && partsScanned < totalParts) {
// The number of partitions to try in this iteration. It is ok for this number to be
// greater than totalParts because we actually cap it at totalParts in runJob.
@@ -1209,7 +1209,7 @@
}

val left = num - buf.size
val p = partsScanned.toInt until math.min(partsScanned + numPartsToTry, totalParts).toInt
val p = partsScanned.until(math.min(partsScanned + numPartsToTry, totalParts).toInt)
val res = sc.runJob(this, (it: Iterator[T]) => it.take(left).toArray, p)

res.foreach(buf ++= _.take(num - buf.size))
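The synchronous `RDD.take` gets the same treatment, so requesting a near-`Int.MaxValue` count from a tiny RDD simply returns every element. A hedged, driver-side sketch mirroring the regression test added in `RDDSuite` below (a local master is assumed):

```scala
// Sketch assuming a local deployment; mirrors the new RDDSuite assertions.
import org.apache.spark.{SparkConf, SparkContext}

val sc = new SparkContext(
  new SparkConf().setAppName("take-overflow-sketch").setMaster("local[2]"))
val nums = sc.parallelize(1 to 2, 2)
// Asking for ~Int.MaxValue elements from a 2-element RDD returns both elements
// instead of failing on an overflowed partition range.
assert(nums.take(2147483638).length == 2)
sc.stop()
```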
4 changes: 4 additions & 0 deletions core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala
@@ -482,6 +482,10 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext {
assert(nums.take(501) === (1 to 501).toArray)
assert(nums.take(999) === (1 to 999).toArray)
assert(nums.take(1000) === (1 to 999).toArray)

nums = sc.parallelize(1 to 2, 2)
assert(nums.take(2147483638).size === 2)
assert(nums.takeAsync(2147483638).get.size === 2)
}

test("top with predefined ordering") {
3 changes: 0 additions & 3 deletions dev/create-release/release-tag.sh
@@ -64,9 +64,6 @@ git commit -a -m "Preparing Spark release $RELEASE_TAG"
echo "Creating tag $RELEASE_TAG at the head of $GIT_BRANCH"
git tag $RELEASE_TAG

# TODO: It would be nice to do some verifications here
# i.e. check whether ec2 scripts have the new version

# Create next version
$MVN versions:set -DnewVersion=$NEXT_VERSION | grep -v "no value" # silence logs
git commit -a -m "Preparing development version $NEXT_VERSION"
1 change: 0 additions & 1 deletion dev/create-release/releaseutils.py
@@ -159,7 +159,6 @@ def get_commits(tag):
"build": CORE_COMPONENT,
"deploy": CORE_COMPONENT,
"documentation": CORE_COMPONENT,
"ec2": "EC2",
"examples": CORE_COMPONENT,
"graphx": "GraphX",
"input/output": CORE_COMPONENT,
8 changes: 4 additions & 4 deletions dev/deps/spark-deps-hadoop-2.2
@@ -84,13 +84,13 @@ hadoop-yarn-server-web-proxy-2.2.0.jar
httpclient-4.3.2.jar
httpcore-4.3.2.jar
ivy-2.4.0.jar
jackson-annotations-2.4.4.jar
jackson-core-2.4.4.jar
jackson-annotations-2.5.3.jar
jackson-core-2.5.3.jar
jackson-core-asl-1.9.13.jar
jackson-databind-2.4.4.jar
jackson-databind-2.5.3.jar
jackson-jaxrs-1.9.13.jar
jackson-mapper-asl-1.9.13.jar
jackson-module-scala_2.10-2.4.4.jar
jackson-module-scala_2.10-2.5.3.jar
jackson-xc-1.9.13.jar
janino-2.7.8.jar
jansi-1.4.jar
8 changes: 4 additions & 4 deletions dev/deps/spark-deps-hadoop-2.3
@@ -79,13 +79,13 @@ hadoop-yarn-server-web-proxy-2.3.0.jar
httpclient-4.3.2.jar
httpcore-4.3.2.jar
ivy-2.4.0.jar
jackson-annotations-2.4.4.jar
jackson-core-2.4.4.jar
jackson-annotations-2.5.3.jar
jackson-core-2.5.3.jar
jackson-core-asl-1.9.13.jar
jackson-databind-2.4.4.jar
jackson-databind-2.5.3.jar
jackson-jaxrs-1.9.13.jar
jackson-mapper-asl-1.9.13.jar
jackson-module-scala_2.10-2.4.4.jar
jackson-module-scala_2.10-2.5.3.jar
jackson-xc-1.9.13.jar
janino-2.7.8.jar
jansi-1.4.jar
8 changes: 4 additions & 4 deletions dev/deps/spark-deps-hadoop-2.4
@@ -79,13 +79,13 @@ hadoop-yarn-server-web-proxy-2.4.0.jar
httpclient-4.3.2.jar
httpcore-4.3.2.jar
ivy-2.4.0.jar
jackson-annotations-2.4.4.jar
jackson-core-2.4.4.jar
jackson-annotations-2.5.3.jar
jackson-core-2.5.3.jar
jackson-core-asl-1.9.13.jar
jackson-databind-2.4.4.jar
jackson-databind-2.5.3.jar
jackson-jaxrs-1.9.13.jar
jackson-mapper-asl-1.9.13.jar
jackson-module-scala_2.10-2.4.4.jar
jackson-module-scala_2.10-2.5.3.jar
jackson-xc-1.9.13.jar
janino-2.7.8.jar
jansi-1.4.jar
8 changes: 4 additions & 4 deletions dev/deps/spark-deps-hadoop-2.6
@@ -85,13 +85,13 @@ htrace-core-3.0.4.jar
httpclient-4.3.2.jar
httpcore-4.3.2.jar
ivy-2.4.0.jar
jackson-annotations-2.4.4.jar
jackson-core-2.4.4.jar
jackson-annotations-2.5.3.jar
jackson-core-2.5.3.jar
jackson-core-asl-1.9.13.jar
jackson-databind-2.4.4.jar
jackson-databind-2.5.3.jar
jackson-jaxrs-1.9.13.jar
jackson-mapper-asl-1.9.13.jar
jackson-module-scala_2.10-2.4.4.jar
jackson-module-scala_2.10-2.5.3.jar
jackson-xc-1.9.13.jar
janino-2.7.8.jar
jansi-1.4.jar
2 changes: 1 addition & 1 deletion dev/lint-python
@@ -19,7 +19,7 @@

SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )"
SPARK_ROOT_DIR="$(dirname "$SCRIPT_DIR")"
PATHS_TO_CHECK="./python/pyspark/ ./ec2/spark_ec2.py ./examples/src/main/python/ ./dev/sparktestsupport"
PATHS_TO_CHECK="./python/pyspark/ ./examples/src/main/python/ ./dev/sparktestsupport"
PATHS_TO_CHECK="$PATHS_TO_CHECK ./dev/run-tests.py ./python/run-tests.py ./dev/run-tests-jenkins.py"
PEP8_REPORT_PATH="$SPARK_ROOT_DIR/dev/pep8-report.txt"
PYLINT_REPORT_PATH="$SPARK_ROOT_DIR/dev/pylint-report.txt"
9 changes: 0 additions & 9 deletions dev/sparktestsupport/modules.py
@@ -406,15 +406,6 @@ def contains_file(self, filename):
should_run_build_tests=True
)

ec2 = Module(
name="ec2",
dependencies=[],
source_file_regexes=[
"ec2/",
]
)


yarn = Module(
name="yarn",
dependencies=[],
17 changes: 4 additions & 13 deletions dev/test-dependencies.sh
@@ -70,19 +70,10 @@ $MVN -q versions:set -DnewVersion=$TEMP_VERSION -DgenerateBackupPoms=false > /de
# Generate manifests for each Hadoop profile:
for HADOOP_PROFILE in "${HADOOP_PROFILES[@]}"; do
echo "Performing Maven install for $HADOOP_PROFILE"
$MVN $HADOOP2_MODULE_PROFILES -P$HADOOP_PROFILE jar:jar install:install -q \
-pl '!assembly' \
-pl '!examples' \
-pl '!external/flume-assembly' \
-pl '!external/kafka-assembly' \
-pl '!external/twitter' \
-pl '!external/flume' \
-pl '!external/mqtt' \
-pl '!external/mqtt-assembly' \
-pl '!external/zeromq' \
-pl '!external/kafka' \
-pl '!tags' \
-DskipTests
$MVN $HADOOP2_MODULE_PROFILES -P$HADOOP_PROFILE jar:jar jar:test-jar install:install -q

echo "Performing Maven validate for $HADOOP_PROFILE"
$MVN $HADOOP2_MODULE_PROFILES -P$HADOOP_PROFILE validate -q

echo "Generating dependency manifest for $HADOOP_PROFILE"
mkdir -p dev/pr-deps
2 changes: 0 additions & 2 deletions docs/_layouts/global.html
@@ -98,8 +98,6 @@
<li><a href="spark-standalone.html">Spark Standalone</a></li>
<li><a href="running-on-mesos.html">Mesos</a></li>
<li><a href="running-on-yarn.html">YARN</a></li>
<li class="divider"></li>
<li><a href="ec2-scripts.html">Amazon EC2</a></li>
</ul>
</li>

2 changes: 0 additions & 2 deletions docs/cluster-overview.md
@@ -53,8 +53,6 @@ The system currently supports three cluster managers:
and service applications.
* [Hadoop YARN](running-on-yarn.html) -- the resource manager in Hadoop 2.

In addition, Spark's [EC2 launch scripts](ec2-scripts.html) make it easy to launch a standalone
cluster on Amazon EC2.

# Submitting Applications

(The remaining changed files are not shown in this view.)
