From 5e35d39e0f26db3b669bc2318bd7b3f9f6c5fc50 Mon Sep 17 00:00:00 2001 From: Ankur Dave Date: Sun, 12 Jan 2014 13:10:53 -0800 Subject: [PATCH] Add PageRank example and data --- docs/graphx-programming-guide.md | 32 ++++++++++++++++++- graphx/data/followers.txt | 12 +++++++ graphx/data/users.txt | 6 ++++ .../apache/spark/graphx/lib/PageRank.scala | 2 +- 4 files changed, 50 insertions(+), 2 deletions(-) create mode 100644 graphx/data/followers.txt create mode 100644 graphx/data/users.txt diff --git a/docs/graphx-programming-guide.md b/docs/graphx-programming-guide.md index 7f93754edb..52668b07c8 100644 --- a/docs/graphx-programming-guide.md +++ b/docs/graphx-programming-guide.md @@ -470,10 +470,40 @@ things to worry about.) # Graph Algorithms -This section should describe the various algorithms and how they are used. +GraphX includes a set of graph algorithms in to simplify analytics. The algorithms are contained in the `org.apache.spark.graphx.lib` package and can be accessed directly as methods on `Graph` via an implicit conversion to [`Algorithms`][Algorithms]. This section describes the algorithms and how they are used. + +[Algorithms]: api/graphx/index.html#org.apache.spark.graphx.lib.Algorithms ## PageRank +PageRank measures the importance of each vertex in a graph, assuming an edge from *u* to *v* represents an endorsement of *v*'s importance by *u*. For example, if a Twitter user is followed by many others, the user will be ranked highly. + +Spark includes an example social network dataset that we can run PageRank on. A set of users is given in `graphx/data/users.txt`, and a set of relationships between users is given in `graphx/data/followers.txt`. We can compute the PageRank of each user as follows: + +{% highlight scala %} +// Load the implicit conversion to Algorithms +import org.apache.spark.graphx.lib._ +// Load the datasets into a graph +val users = sc.textFile("graphx/data/users.txt").map { line => + val fields = line.split("\\s+") + (fields(0).toLong, fields(1)) +} +val followers = sc.textFile("graphx/data/followers.txt").map { line => + val fields = line.split("\\s+") + Edge(fields(0).toLong, fields(1).toLong, 1) +} +val graph = Graph(users, followers) +// Run PageRank +val ranks = graph.pageRank(0.0001).vertices +// Join the ranks with the usernames +val ranksByUsername = users.leftOuterJoin(ranks).map { + case (id, (username, rankOpt)) => (username, rankOpt.getOrElse(0.0)) +} +// Print the result +println(ranksByUsername.collect().mkString("\n")) +{% endhighlight %} + + ## Connected Components ## Shortest Path diff --git a/graphx/data/followers.txt b/graphx/data/followers.txt new file mode 100644 index 0000000000..0f46d80806 --- /dev/null +++ b/graphx/data/followers.txt @@ -0,0 +1,12 @@ +2 1 +3 1 +4 1 +6 1 +3 2 +6 2 +7 2 +6 3 +7 3 +7 6 +6 7 +3 7 diff --git a/graphx/data/users.txt b/graphx/data/users.txt new file mode 100644 index 0000000000..ce3d06c600 --- /dev/null +++ b/graphx/data/users.txt @@ -0,0 +1,6 @@ +1 BarackObama +2 ericschmidt +3 jeresig +4 justinbieber +6 matei_zaharia +7 odersky diff --git a/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala b/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala index 809b6d0855..cf95267e77 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala @@ -106,7 +106,7 @@ object PageRank extends Logging { * @tparam ED the original edge attribute (not used) * * @param graph the graph on which to compute PageRank - * @param tol the tolerance allowed at convergence (smaller => more * accurate). + * @param tol the tolerance allowed at convergence (smaller => more accurate). * @param resetProb the random reset probability (alpha) * * @return the graph containing with each vertex containing the PageRank and each edge