[SPARK-12737] Decrease the redundant activeIds sent to remote mirrors in "aggregateMessagesWithActiveSet" #10676

Closed · wants to merge 1 commit
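In short: `aggregateMessagesWithActiveSet` previously shipped every active vertex ID to both the source and destination positions of every edge partition that references it. This patch threads the active set's `EdgeDirection` down through `ReplicatedVertexView` and `VertexRDD.shipVertexIds`, so IDs are shipped only to the positions that the direction actually consults. Below is a minimal, self-contained sketch of that mapping (plain Scala written for this summary; the real logic lives in `ReplicatedVertexView.withActiveSet` in the diff):

```scala
// Sketch only: mirrors the patch's EdgeDirection -> (shipSrc, shipDst)
// dispatch without depending on Spark. The enum stands in for
// org.apache.spark.graphx.EdgeDirection.
object ActiveSetShippingSketch {
  sealed trait EdgeDirection
  case object In extends EdgeDirection      // edge active if its dst is active
  case object Out extends EdgeDirection     // edge active if its src is active
  case object Either extends EdgeDirection  // active if either end is active
  case object Both extends EdgeDirection    // active only if both ends are

  /** Which positions in an edge partition need the active IDs. */
  def shipFlags(dir: Option[EdgeDirection]): (Boolean, Boolean) = dir match {
    case Some(Out) => (true, false)  // only srcIds are tested for activeness
    case Some(In)  => (false, true)  // only dstIds are tested
    case Some(_)   => (true, true)   // Either / Both test both ends
    case None      => (true, true)   // no direction given: ship everywhere
  }

  def main(args: Array[String]): Unit = {
    assert(shipFlags(Some(Out)) == (true, false))
    assert(shipFlags(Some(In)) == (false, true))
    println("direction -> ship-flags mapping matches the patch")
  }
}
```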
graphx/src/main/scala/org/apache/spark/graphx/VertexRDD.scala (12 changes: 7 additions & 5 deletions)
@@ -21,12 +21,13 @@ import scala.reflect.ClassTag
 
 import org.apache.spark._
 import org.apache.spark.SparkContext._
-import org.apache.spark.rdd._
-import org.apache.spark.storage.StorageLevel
 
 import org.apache.spark.graphx.impl.RoutingTablePartition
 import org.apache.spark.graphx.impl.ShippableVertexPartition
 import org.apache.spark.graphx.impl.VertexAttributeBlock
 import org.apache.spark.graphx.impl.VertexRDDImpl
+import org.apache.spark.rdd._
+import org.apache.spark.storage.StorageLevel
 
 /**
  * Extends `RDD[(VertexId, VD)]` by ensuring that there is only one entry for each vertex and by
@@ -54,8 +55,8 @@ import org.apache.spark.storage.StorageLevel
  * @tparam VD the vertex attribute associated with each vertex in the set.
  */
 abstract class VertexRDD[VD](
-    sc: SparkContext,
-    deps: Seq[Dependency[_]]) extends RDD[(VertexId, VD)](sc, deps) {
+    @transient sc: SparkContext,
+    @transient deps: Seq[Dependency[_]]) extends RDD[(VertexId, VD)](sc, deps) {
 
   implicit protected def vdTag: ClassTag[VD]
 
@@ -255,7 +256,8 @@ abstract class VertexRDD[VD](
       shipSrc: Boolean, shipDst: Boolean): RDD[(PartitionID, VertexAttributeBlock[VD])]
 
   /** Generates an RDD of vertex IDs suitable for shipping to the edge partitions. */
-  private[graphx] def shipVertexIds(): RDD[(PartitionID, Array[VertexId])]
+  private[graphx] def shipVertexIds(
+      shipSrc: Boolean, shipDst: Boolean): RDD[(PartitionID, Array[VertexId])]
 
 } // end of VertexRDD
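The new parameters mirror the `shipSrc`/`shipDst` flags that `shipVertexAttributes` (directly above) already takes. A hypothetical before/after at a GraphX-internal call site, shown purely for illustration and not taken from the patch:

```scala
// Before: every referenced vertex ID went to every edge partition.
//   val ids = vertices.shipVertexIds()
// After: the caller states which positions it will consult, e.g. for
// EdgeDirection.Out only source positions need the active IDs:
//   val ids = vertices.shipVertexIds(shipSrc = true, shipDst = false)
```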
graphx/src/main/scala/org/apache/spark/graphx/impl/GraphImpl.scala

@@ -21,11 +21,12 @@ import scala.reflect.{classTag, ClassTag}
 
 import org.apache.spark.HashPartitioner
 import org.apache.spark.SparkContext._
-import org.apache.spark.rdd.{RDD, ShuffledRDD}
-import org.apache.spark.storage.StorageLevel
 import org.apache.spark.graphx._
 import org.apache.spark.graphx.impl.GraphImpl._
 import org.apache.spark.graphx.util.BytecodeUtils
+import org.apache.spark.rdd.{RDD, ShuffledRDD}
+import org.apache.spark.storage.StorageLevel
 
 
 /**
  * An implementation of [[org.apache.spark.graphx.Graph]] to support computation on graphs.
@@ -222,13 +223,14 @@ class GraphImpl[VD: ClassTag, ED: ClassTag] protected (
     // For each vertex, replicate its attribute only to partitions where it is
     // in the relevant position in an edge.
     replicatedVertexView.upgrade(vertices, tripletFields.useSrc, tripletFields.useDst)
+    val activeDirectionOpt = activeSetOpt.map(_._2)
+
     val view = activeSetOpt match {
       case Some((activeSet, _)) =>
-        replicatedVertexView.withActiveSet(activeSet)
+        replicatedVertexView.withActiveSet(activeSet, activeDirectionOpt)
       case None =>
         replicatedVertexView
     }
-    val activeDirectionOpt = activeSetOpt.map(_._2)
 
     // Map and combine.
     val preAgg = view.edges.partitionsRDD.mapPartitions(_.flatMap {
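For orientation, this hunk sits inside `GraphImpl.aggregateMessagesWithActiveSet`, which already receives the direction alongside the active set; the patch simply computes `activeDirectionOpt` before building the view so it can be passed through. The enclosing method's signature, quoted as it appears in this version of GraphX:

```scala
// GraphX-internal entry point containing the hunk above; VD and ED are
// the graph's vertex and edge attribute types.
private[graphx] def aggregateMessagesWithActiveSet[A: ClassTag](
    sendMsg: EdgeContext[VD, ED, A] => Unit,
    mergeMsg: (A, A) => A,
    tripletFields: TripletFields,
    activeSetOpt: Option[(VertexRDD[_], EdgeDirection)]): VertexRDD[A]
```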
graphx/src/main/scala/org/apache/spark/graphx/impl/ReplicatedVertexView.scala

@@ -20,9 +20,10 @@ package org.apache.spark.graphx.impl
 import scala.reflect.{classTag, ClassTag}
 
 import org.apache.spark.SparkContext._
+import org.apache.spark.graphx._
 import org.apache.spark.rdd.RDD
 
-import org.apache.spark.graphx._
-
 /**
  * Manages shipping vertex attributes to the edge partitions of an
  * [[org.apache.spark.graphx.EdgeRDD]]. Vertex attributes may be partially shipped to construct a
@@ -85,12 +86,14 @@ class ReplicatedVertexView[VD: ClassTag, ED: ClassTag](
    * vertex ids present in `actives`. This ships a vertex id to all edge partitions where it is
    * referenced, ignoring the attribute shipping level.
    */
-  def withActiveSet(actives: VertexRDD[_]): ReplicatedVertexView[VD, ED] = {
-    val shippedActives = actives.shipVertexIds()
+  def withActiveSetPosition(actives: VertexRDD[_], useSrc: Boolean, useDst: Boolean):
+    ReplicatedVertexView[VD, ED] = {
+    val shippedActives = actives.shipVertexIds(useSrc, useDst)
       .setName("ReplicatedVertexView.withActiveSet - shippedActives (broadcast)")
       .partitionBy(edges.partitioner.get)
 
-    val newEdges = edges.withPartitionsRDD(edges.partitionsRDD.zipPartitions(shippedActives) {
+    val newEdges = edges.withPartitionsRDD(edges.partitionsRDD
+      .zipPartitions(shippedActives) {
       (ePartIter, shippedActivesIter) => ePartIter.map {
         case (pid, edgePartition) =>
           (pid, edgePartition.withActiveSet(shippedActivesIter.flatMap(_._2.iterator)))
@@ -99,6 +102,24 @@
     new ReplicatedVertexView(newEdges, hasSrcId, hasDstId)
   }
 
+
+  def withActiveSet(actives: VertexRDD[_], edgeDir: Option[EdgeDirection]):
+    ReplicatedVertexView[VD, ED] = {
+    edgeDir match {
+      case Some(EdgeDirection.Both) =>
+        withActiveSetPosition(actives, true, true)
+      case Some(EdgeDirection.Either) =>
+        withActiveSetPosition(actives, true, true)
+      case Some(EdgeDirection.Out) =>
+        withActiveSetPosition(actives, true, false)
+      case Some(EdgeDirection.In) =>
+        withActiveSetPosition(actives, false, true)
+      case _ =>
+        this
+    }
+  }
+
+
   /**
    * Return a new `ReplicatedVertexView` where vertex attributes in edge partition are updated using
    * `updates`. This ships a vertex attribute only to the edge partitions where it is in the
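A note on the `withActiveSet` dispatch above: with `EdgeDirection.Out`, `sendMsg` runs only on edges whose source vertex is active, so edge partitions only ever test `srcIds` against the active set and destination IDs need not be shipped; `EdgeDirection.In` is the mirror case. `Both` and `Either` must ship to both positions, since activeness of either endpoint can matter, and the catch-all (no direction) returns the view with no active set applied.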
graphx/src/main/scala/org/apache/spark/graphx/impl/ShippableVertexPartition.scala

@@ -19,9 +19,10 @@ package org.apache.spark.graphx.impl
 
 import scala.reflect.ClassTag
 
-import org.apache.spark.util.collection.{BitSet, PrimitiveVector}
-
 import org.apache.spark.graphx._
 import org.apache.spark.graphx.util.collection.GraphXPrimitiveKeyOpenHashMap
+import org.apache.spark.util.collection.{BitSet, PrimitiveVector}
 
 /** Stores vertex attributes to ship to an edge partition. */
 private[graphx]
@@ -134,11 +135,12 @@ class ShippableVertexPartition[VD: ClassTag](
    * contains the visible vertex ids from the current partition that are referenced in the edge
    * partition.
    */
-  def shipVertexIds(): Iterator[(PartitionID, Array[VertexId])] = {
+  def shipVertexIds(shipSrc: Boolean, shipDst: Boolean):
+    Iterator[(PartitionID, Array[VertexId])] = {
     Iterator.tabulate(routingTable.numEdgePartitions) { pid =>
       val vids = new PrimitiveVector[VertexId](routingTable.partitionSize(pid))
       var i = 0
-      routingTable.foreachWithinEdgePartition(pid, true, true) { vid =>
+      routingTable.foreachWithinEdgePartition(pid, shipSrc, shipDst) { vid =>
         if (isDefined(vid)) {
           vids += vid
         }
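The actual position filtering happens in `RoutingTablePartition.foreachWithinEdgePartition`, which iterates the vertex IDs registered for an edge partition and skips entries whose position flags do not match. A REPL-style simulation of that filtering (hypothetical data types, written for illustration; GraphX encodes these flags compactly inside the routing table rather than with a case class):

```scala
// Each routing entry records whether the vertex appears as a source
// and/or a destination in the target edge partition.
final case class RoutingEntry(vid: Long, isSrc: Boolean, isDst: Boolean)

// Keep only the IDs whose position will actually be consulted.
def shipIds(entries: Seq[RoutingEntry],
    shipSrc: Boolean, shipDst: Boolean): Array[Long] =
  entries.collect {
    case e if (shipSrc && e.isSrc) || (shipDst && e.isDst) => e.vid
  }.toArray

// Vertex 1 appears only as a source, vertex 2 only as a destination.
val table = Seq(
  RoutingEntry(1L, isSrc = true, isDst = false),
  RoutingEntry(2L, isSrc = false, isDst = true))

shipIds(table, shipSrc = true, shipDst = true)   // Array(1, 2): old behavior
shipIds(table, shipSrc = true, shipDst = false)  // Array(1): EdgeDirection.Out
```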
graphx/src/main/scala/org/apache/spark/graphx/impl/VertexRDDImpl.scala

@@ -21,10 +21,11 @@ import scala.reflect.ClassTag
 
 import org.apache.spark._
 import org.apache.spark.SparkContext._
+import org.apache.spark.graphx._
 import org.apache.spark.rdd._
 import org.apache.spark.storage.StorageLevel
 
-import org.apache.spark.graphx._
-
 class VertexRDDImpl[VD] private[graphx] (
     @transient val partitionsRDD: RDD[ShippableVertexPartition[VD]],
     val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY)
@@ -246,8 +247,9 @@ class VertexRDDImpl[VD] private[graphx] (
     partitionsRDD.mapPartitions(_.flatMap(_.shipVertexAttributes(shipSrc, shipDst)))
   }
 
-  override private[graphx] def shipVertexIds(): RDD[(PartitionID, Array[VertexId])] = {
-    partitionsRDD.mapPartitions(_.flatMap(_.shipVertexIds()))
+  override private[graphx] def shipVertexIds(shipSrc: Boolean, shipDst: Boolean):
+    RDD[(PartitionID, Array[VertexId])] = {
+    partitionsRDD.mapPartitions(_.flatMap(_.shipVertexIds(shipSrc, shipDst)))
   }
 
 }
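End to end, the path this patch trims is exercised through `Pregel`, which forwards its `activeDirection` into `aggregateMessagesWithActiveSet`. A standard single-source-shortest-paths run (public API, adapted from the GraphX programming guide) is the kind of workload that now ships fewer active IDs each superstep when the direction is `Out`:

```scala
import org.apache.spark.SparkContext
import org.apache.spark.graphx._
import org.apache.spark.graphx.util.GraphGenerators

object SSSPExample {
  // With activeDirection = EdgeDirection.Out, each Pregel superstep ships
  // the active vertex IDs only to source positions in the edge partitions.
  def sssp(sc: SparkContext, sourceId: VertexId): Graph[Double, Double] = {
    val graph = GraphGenerators
      .logNormalGraph(sc, numVertices = 100)
      .mapEdges(e => e.attr.toDouble)
    val initial = graph.mapVertices((id, _) =>
      if (id == sourceId) 0.0 else Double.PositiveInfinity)

    Pregel(initial, Double.PositiveInfinity,
      activeDirection = EdgeDirection.Out)(
      // Vertex program: keep the shorter distance.
      vprog = (id, dist, newDist) => math.min(dist, newDist),
      // Send a message only along edges that improve the destination.
      sendMsg = triplet =>
        if (triplet.srcAttr + triplet.attr < triplet.dstAttr) {
          Iterator((triplet.dstId, triplet.srcAttr + triplet.attr))
        } else {
          Iterator.empty
        },
      // Combine incoming distances.
      mergeMsg = (a, b) => math.min(a, b))
  }
}
```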