From a696278c8defa91eff3a1cde5508c93dd9ff066c Mon Sep 17 00:00:00 2001 From: Adam Schill Collberg Date: Tue, 15 Oct 2024 16:17:57 +0200 Subject: [PATCH 1/9] Add notebook for visualizing projections with PyVis --- README.md | 1 + .../pages/tutorials/visualize-with-pyvis.adoc | 201 +++++++++++ examples/visualize-with-pyvis.ipynb | 333 ++++++++++++++++++ 3 files changed, 535 insertions(+) create mode 100644 doc/modules/ROOT/pages/tutorials/visualize-with-pyvis.adoc create mode 100644 examples/visualize-with-pyvis.ipynb diff --git a/README.md b/README.md index 32fd749b7..b3c55992b 100644 --- a/README.md +++ b/README.md @@ -85,6 +85,7 @@ Full end-to-end examples in Jupyter ready-to-run notebooks can be found in the [ * [Load data to a projected graph via graph construction](examples/load-data-via-graph-construction.ipynb) * [Heterogeneous Node Classification with HashGNN and Autotuning](https://github.com/neo4j/graph-data-science-client/tree/main/examples/heterogeneous-node-classification-with-hashgnn.ipynb) * [Perform inference using pre-trained KGE models](examples/kge-predict-transe-pyg-train.ipynb) +* [Visualize GDS Projections with PyVis](examples/visualize-with-pyvis.ipynb) ## Documentation diff --git a/doc/modules/ROOT/pages/tutorials/visualize-with-pyvis.adoc b/doc/modules/ROOT/pages/tutorials/visualize-with-pyvis.adoc new file mode 100644 index 000000000..960c40b4f --- /dev/null +++ b/doc/modules/ROOT/pages/tutorials/visualize-with-pyvis.adoc @@ -0,0 +1,201 @@ +// DO NOT EDIT - AsciiDoc file generated automatically + += GDS Projection Visualization with PyVis + + +https://colab.research.google.com/github/neo4j/graph-data-science-client/blob/main/examples/import-sample-export-gnn.ipynb[image:https://colab.research.google.com/assets/colab-badge.svg[Open +In Colab]] + + +This Jupyter notebook is hosted +https://github.com/neo4j/graph-data-science-client/blob/main/examples/visualize-with-pyvis.ipynb[here] +in the Neo4j Graph Data Science Client Github repository. 
+ +The notebook exemplifies how to visualize a graph projection in the GDS +Graph Catalog using the `graphdatascience` +(https://neo4j.com/docs/graph-data-science-client/current/[docs]) and +`pyvis` (https://pyvis.readthedocs.io/en/latest/index.html[docs]) +libraries. + +== Prerequisites + +Running this notebook requires a Neo4j server with GDS installed. We +recommend using Neo4j Desktop with GDS, or AuraDS. + +Also required are of course the Python libraries `graphdatascience` and +`pyvis`: + +[source, python, role=no-test] +---- +%pip install graphdatascience pyvis +---- + +== Setup + +We start by importing our dependencies and setting up our GDS client +connection to the database. + +[source, python, role=no-test] +---- +from graphdatascience import GraphDataScience +import os +from pyvis.network import Network +---- + +[source, python, role=no-test] +---- +# Get Neo4j DB URI, credentials and name from environment if applicable +NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687") +NEO4J_AUTH = None +NEO4J_DB = os.environ.get("NEO4J_DB", "neo4j") +if os.environ.get("NEO4J_USER") and os.environ.get("NEO4J_PASSWORD"): + NEO4J_AUTH = ( + os.environ.get("NEO4J_USER"), + os.environ.get("NEO4J_PASSWORD"), + ) +gds = GraphDataScience(NEO4J_URI, auth=NEO4J_AUTH, database=NEO4J_DB) +---- + +== Sampling Cora + +Next we use the +https://neo4j.com/docs/graph-data-science-client/current/common-datasets/#_cora[built-in +Cora loader] to get the data into GDS. The nodes in the Cora dataset +represent academic papers, and the relationships connecting them +are citations. + +We will then sample a smaller representative subgraph from it that is +more suitable for visualization. + +[source, python, role=no-test] +---- +G = gds.graph.load_cora() +---- + +Let’s make sure we constructed the correct graph. 
+ +[source, python, role=no-test] +---- +print(f"Metadata for our loaded Cora graph `G`: {G}") +print(f"Node labels present in `G`: {G.node_labels()}") +---- + +It’s looks correct! Now let’s go ahead and sample the graph. + +We use the random walk with restarts sampling algorithm to get a smaller +graph that structurally represents the full graph. In this example we +will use the algorithm’s default parameters, but check out +https://neo4j.com/docs/graph-data-science/current/management-ops/graph-creation/sampling/rwr/[the +algorithm’s docs] to see how you can for example specify the size of the +subgraph, and choose which start node around which the subgraph will be +sampled. + +[source, python, role=no-test] +---- +G_sample, _ = gds.alpha.graph.sample.rwr("cora_sample", G, randomSeed=42, concurrency=1) +---- + +We should have somewhere around 0.15 * 2708 ~ 406 nodes in our sample. +And let’s see how many relationships we got. + +[source, python, role=no-test] +---- +print(f"Number of nodes in our sample: {G_sample.node_count()}") +print(f"Number of relationships in our sample: {G_sample.relationship_count()}") +---- + +Let’s also compute +https://neo4j.com/docs/graph-data-science/current/algorithms/page-rank/[PageRank] +on our sample graph, in order to get an importance score that we call +``rank'' for each node. It will be interesting for context when we +visualize the graph. + +[source, python, role=no-test] +---- +gds.pageRank.mutate(G_sample, mutateProperty="rank") +---- + +== Exporting the sampled Cora graph + +We can now export the topology and node properties of our sampled graph +that we want to visualize. + +Let’s start by fetching the relationships. + +[source, python, role=no-test] +---- +sample_topology_df = gds.beta.graph.relationships.stream(G_sample) +display(sample_topology_df) +---- + +We get the right amount of rows, one for each expected relationship. So +that looks good. + +Next we should fetch the node properties we are interested in. 
Each node +will have a ``subject'' property which will be an integer 0,…,6 that +indicates which of seven academic subjects the paper represented by the +node belongs to. We will also fetch the PageRank property ``rank'' that +we computed above. + +[source, python, role=no-test] +---- +sample_node_properties_df = gds.graph.nodeProperties.stream( + G_sample, + ["subject", "rank"], + separate_property_columns=True, +) +display(sample_node_properties_df) +---- + +Now that we have all the data we want to visualize, we can create a +network with PyVis. We color each node according to its ``subject'', and +size it according to its ``rank''. + +[source, python, role=no-test] +---- +net = Network(notebook = True, +cdn_resources="remote", + bgcolor = "#222222", + font_color = "white", + height = "750px", # Modify according to your screen size + width = "100%", +) + +# Seven suitable light colors, one for each "subject" +subject_to_color = ["#80cce9", "#fbd266", "#a9eebc", "#e53145", "#d2a6e2", "#f3f3f3", "#ff91af"] + +# Add all the nodes +for _, node in sample_node_properties_df.iterrows(): + net.add_node(int(node["nodeId"]), color=subject_to_color[int(node["subject"])], value=node["rank"]) + +# Add all the relationships +net.add_edges(zip(sample_topology_df["sourceNodeId"], sample_topology_df["targetNodeId"])) + +net.show("cora-sample.html") +---- + +Unsuprisingly we can see that papers largely seem clustered by academic +subject. We also note that some nodes appear larger in size, indicating +that they have a higher centrality score according to PageRank. + +We can scroll over the graphic to zoom in/out, and ``click and drag'' +the background to navigate to different parts of the network. If we +click on a node, it will be highlighted along with the relationships +connected to it. And if we ``click and drag'' a node, we can move it. 
+ +Additionally one could enable more sophisticated navigational features +for searching and filtering by providing `select_menu = True` and +`filter_menu = True` respectively to the PyVis `Network` constructor +above. Check out the +https://pyvis.readthedocs.io/en/latest/index.html[PyVis documentation] +for this. + +== Cleanup + +We remove the Cora graphs from the GDS graph catalog to free up memory. + +[source, python, role=no-test] +---- +_ = G_sample.drop() +_ = G.drop() +---- diff --git a/examples/visualize-with-pyvis.ipynb b/examples/visualize-with-pyvis.ipynb new file mode 100644 index 000000000..ba273413f --- /dev/null +++ b/examples/visualize-with-pyvis.ipynb @@ -0,0 +1,333 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "82568b27", + "metadata": {}, + "source": [ + "# GDS Projection Visualization with PyVis" + ] + }, + { + "cell_type": "markdown", + "id": "c4b7883a", + "metadata": { + "colab_type": "text" + }, + "source": [ + "\n", + " \"Open\n", + "" + ] + }, + { + "cell_type": "markdown", + "id": "922d0fbb", + "metadata": {}, + "source": [ + "This Jupyter notebook is hosted [here](https://github.com/neo4j/graph-data-science-client/blob/main/examples/visualize-with-pyvis.ipynb) in the Neo4j Graph Data Science Client Github repository.\n", + "\n", + "The notebook exemplifies how to visualize a graph projection in the GDS Graph Catalog using the `graphdatascience` ([docs](https://neo4j.com/docs/graph-data-science-client/current/)) and `pyvis` ([docs](https://pyvis.readthedocs.io/en/latest/index.html)) libraries." 
+ ] + }, + { + "cell_type": "markdown", + "id": "c64321df", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "\n", + "Running this notebook requires a Neo4j server with GDS installed.\n", + "We recommend using Neo4j Desktop with GDS, or AuraDS.\n", + "\n", + "Also required are of course the Python libraries `graphdatascience` and `pyvis`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc4c3baf", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install graphdatascience pyvis" + ] + }, + { + "cell_type": "markdown", + "id": "3bdf33d6", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "We start by importing our dependencies and setting up our GDS client connection to the database." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26fb276b", + "metadata": {}, + "outputs": [], + "source": [ + "from graphdatascience import GraphDataScience\n", + "import os\n", + "from pyvis.network import Network" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "937f1476", + "metadata": {}, + "outputs": [], + "source": [ + "# Get Neo4j DB URI, credentials and name from environment if applicable\n", + "NEO4J_URI = os.environ.get(\"NEO4J_URI\", \"bolt://localhost:7687\")\n", + "NEO4J_AUTH = None\n", + "NEO4J_DB = os.environ.get(\"NEO4J_DB\", \"neo4j\")\n", + "if os.environ.get(\"NEO4J_USER\") and os.environ.get(\"NEO4J_PASSWORD\"):\n", + " NEO4J_AUTH = (\n", + " os.environ.get(\"NEO4J_USER\"),\n", + " os.environ.get(\"NEO4J_PASSWORD\"),\n", + " )\n", + "gds = GraphDataScience(NEO4J_URI, auth=NEO4J_AUTH, database=NEO4J_DB)" + ] + }, + { + "cell_type": "markdown", + "id": "b1a8ab78", + "metadata": {}, + "source": [ + "## Sampling Cora\n", + "\n", + "Next we use the [built-in Cora loader](https://neo4j.com/docs/graph-data-science-client/current/common-datasets/#_cora) to get the data into GDS.\n", + "The nodes in the Cora dataset is represented by academic papers, and the relationships connecting 
them are citations.\n", + "\n", + "We will then sample a smaller representative subgraph from it that is more suitable for visualization." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a575da60", + "metadata": {}, + "outputs": [], + "source": [ + "G = gds.graph.load_cora()" + ] + }, + { + "cell_type": "markdown", + "id": "723d6457", + "metadata": {}, + "source": [ + "Let's make sure we constructed the correct graph." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "428ac7b8", + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"Metadata for our loaded Cora graph `G`: {G}\")\n", + "print(f\"Node labels present in `G`: {G.node_labels()}\")" + ] + }, + { + "cell_type": "markdown", + "id": "3e8b8a83", + "metadata": {}, + "source": [ + "It's looks correct!\n", + "Now let's go ahead and sample the graph.\n", + "\n", + "We use the random walk with restarts sampling algorithm to get a smaller graph that structurally represents the full graph.\n", + "In this example we will use the algorithm's default parameters, but check out [the algorithm's docs](https://neo4j.com/docs/graph-data-science/current/management-ops/graph-creation/sampling/rwr/) to see how you can for example specify the size of the subgraph, and choose which start node around which the subgraph will be sampled." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d116738", + "metadata": {}, + "outputs": [], + "source": [ + "G_sample, _ = gds.alpha.graph.sample.rwr(\"cora_sample\", G, randomSeed=42, concurrency=1)" + ] + }, + { + "cell_type": "markdown", + "id": "324e0d4c", + "metadata": {}, + "source": [ + "We should have somewhere around 0.15 * 2708 ~ 406 nodes in our sample.\n", + "And let's see how many relationships we got." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d66ea2d5", + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"Number of nodes in our sample: {G_sample.node_count()}\")\n", + "print(f\"Number of relationships in our sample: {G_sample.relationship_count()}\")" + ] + }, + { + "cell_type": "markdown", + "id": "3dd66079", + "metadata": {}, + "source": [ + "Let's also compute [PageRank](https://neo4j.com/docs/graph-data-science/current/algorithms/page-rank/) on our sample graph, in order to get an importance score that we call \"rank\" for each node.\n", + "It will be interesting for context when we visualize the graph." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d80d653", + "metadata": {}, + "outputs": [], + "source": [ + "gds.pageRank.mutate(G_sample, mutateProperty=\"rank\")" + ] + }, + { + "cell_type": "markdown", + "id": "722caa7b", + "metadata": {}, + "source": [ + "## Exporting the sampled Cora graph\n", + "\n", + "We can now export the topology and node properties of our sampled graph that we want to visualize.\n", + "\n", + "Let's start by fetching the relationships." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d3c76b25", + "metadata": {}, + "outputs": [], + "source": [ + "sample_topology_df = gds.beta.graph.relationships.stream(G_sample)\n", + "display(sample_topology_df)" + ] + }, + { + "cell_type": "markdown", + "id": "a4ad967b", + "metadata": {}, + "source": [ + "We get the right amount of rows, one for each expected relationship.\n", + "So that looks good.\n", + "\n", + "Next we should fetch the node properties we are interested in.\n", + "Each node will have a \"subject\" property which will be an integer 0,...,6 that indicates which of seven academic subjects the paper represented by the nodes belong to.\n", + "We will also fetch the PageRank property \"rank\" that we computed above." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c52e3b2", + "metadata": {}, + "outputs": [], + "source": [ + "sample_node_properties_df = gds.graph.nodeProperties.stream(\n", + " G_sample,\n", + " [\"subject\", \"rank\"],\n", + " separate_property_columns=True,\n", + ")\n", + "display(sample_node_properties_df)" + ] + }, + { + "cell_type": "markdown", + "id": "ff81f977", + "metadata": {}, + "source": [ + "Now that we have all the data we want to visualize, we can create a network with PyVis.\n", + "We color each node according to its \"subject\", and size it according to its \"rank\"." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "93cb3bdf", + "metadata": {}, + "outputs": [], + "source": [ + "net = Network(notebook = True,\n", + "cdn_resources=\"remote\",\n", + " bgcolor = \"#222222\",\n", + " font_color = \"white\",\n", + " height = \"750px\", # Modify according to your screen size\n", + " width = \"100%\",\n", + ")\n", + "\n", + "# Seven suitable light colors, one for each \"subject\"\n", + "subject_to_color = [\"#80cce9\", \"#fbd266\", \"#a9eebc\", \"#e53145\", \"#d2a6e2\", \"#f3f3f3\", \"#ff91af\"]\n", + "\n", + "# Add all the nodes\n", + "for _, node in sample_node_properties_df.iterrows():\n", + " net.add_node(int(node[\"nodeId\"]), color=subject_to_color[int(node[\"subject\"])], value=node[\"rank\"])\n", + "\n", + "# Add all the relationships\n", + "net.add_edges(zip(sample_topology_df[\"sourceNodeId\"], sample_topology_df[\"targetNodeId\"]))\n", + "\n", + "net.show(\"cora-sample.html\")" + ] + }, + { + "cell_type": "markdown", + "id": "8f8bc4a2", + "metadata": {}, + "source": [ + "Unsuprisingly we can see that papers largely seem clustered by academic subject.\n", + "We also note that some nodes appear larger in size, indicating that they have a higher centrality score according to PageRank.\n", + "\n", + "We can scroll over the graphic to zoom in/out, and \"click and drag\" the background to navigate to 
different parts of the network.\n", + "If we click on a node, it will be highlighted along with the relationships connected to it.\n", + "And if we \"click and drag\" a node, we can move it.\n", + "\n", + "Additionally one could enable more sophisticated navigational features for searching and filtering by providing `select_menu = True` and `filter_menu = True` respectively to the PyVis `Network` constructor above.\n", + "Check out the [PyVis documentation](https://pyvis.readthedocs.io/en/latest/index.html) for this." + ] + }, + { + "cell_type": "markdown", + "id": "928156de", + "metadata": {}, + "source": [ + "## Cleanup\n", + "\n", + "We remove the Cora graphs from the GDS graph catalog to free up memory." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "33523a0b", + "metadata": {}, + "outputs": [], + "source": [ + "_ = G_sample.drop()\n", + "_ = G.drop()" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From fe47d91c459250ad306e353856d1f6b53444d72a Mon Sep 17 00:00:00 2001 From: Adam Schill Collberg Date: Wed, 16 Oct 2024 14:15:49 +0200 Subject: [PATCH 2/9] Address review comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Florentin Dörre --- README.md | 2 +- .../{visualize-with-pyvis.adoc => visualize.adoc} | 8 ++++---- doc/modules/ROOT/partials/tutorial-list.adoc | 1 + examples/{visualize-with-pyvis.ipynb => visualize.ipynb} | 8 ++++---- 4 files changed, 10 insertions(+), 9 deletions(-) rename doc/modules/ROOT/pages/tutorials/{visualize-with-pyvis.adoc => visualize.adoc} (95%) rename examples/{visualize-with-pyvis.ipynb => visualize.ipynb} (96%) diff --git a/README.md b/README.md index b3c55992b..cd7dd5856 100644 --- a/README.md +++ b/README.md @@ -85,7 +85,7 @@ Full end-to-end examples in Jupyter ready-to-run notebooks can be found in the [ * [Load data to a projected graph via graph 
construction](examples/load-data-via-graph-construction.ipynb) * [Heterogeneous Node Classification with HashGNN and Autotuning](https://github.com/neo4j/graph-data-science-client/tree/main/examples/heterogeneous-node-classification-with-hashgnn.ipynb) * [Perform inference using pre-trained KGE models](examples/kge-predict-transe-pyg-train.ipynb) -* [Visualize GDS Projections with PyVis](examples/visualize-with-pyvis.ipynb) +* [Visualize GDS Projections](examples/visualize.ipynb) ## Documentation diff --git a/doc/modules/ROOT/pages/tutorials/visualize-with-pyvis.adoc b/doc/modules/ROOT/pages/tutorials/visualize.adoc similarity index 95% rename from doc/modules/ROOT/pages/tutorials/visualize-with-pyvis.adoc rename to doc/modules/ROOT/pages/tutorials/visualize.adoc index 960c40b4f..545a585b3 100644 --- a/doc/modules/ROOT/pages/tutorials/visualize-with-pyvis.adoc +++ b/doc/modules/ROOT/pages/tutorials/visualize.adoc @@ -1,6 +1,6 @@ // DO NOT EDIT - AsciiDoc file generated automatically -= GDS Projection Visualization with PyVis += Visualizing GDS Projections https://colab.research.google.com/github/neo4j/graph-data-science-client/blob/main/examples/import-sample-export-gnn.ipynb[image:https://colab.research.google.com/assets/colab-badge.svg[Open @@ -92,7 +92,7 @@ sampled. [source, python, role=no-test] ---- -G_sample, _ = gds.alpha.graph.sample.rwr("cora_sample", G, randomSeed=42, concurrency=1) +G_sample, _ = gds.graph.sample.rwr("cora_sample", G, randomSeed=42, concurrency=1) ---- We should have somewhere around 0.15 * 2708 ~ 406 nodes in our sample. @@ -124,7 +124,7 @@ Let’s start by fetching the relationships. 
[source, python, role=no-test] ---- -sample_topology_df = gds.beta.graph.relationships.stream(G_sample) +sample_topology_df = gds.graph.relationships.stream(G_sample) display(sample_topology_df) ---- @@ -174,7 +174,7 @@ net.add_edges(zip(sample_topology_df["sourceNodeId"], sample_topology_df["target net.show("cora-sample.html") ---- -Unsuprisingly we can see that papers largely seem clustered by academic +Unsurprisingly we can see that papers largely seem clustered by academic subject. We also note that some nodes appear larger in size, indicating that they have a higher centrality score according to PageRank. diff --git a/doc/modules/ROOT/partials/tutorial-list.adoc b/doc/modules/ROOT/partials/tutorial-list.adoc index d3faa903f..15c1c2cba 100644 --- a/doc/modules/ROOT/partials/tutorial-list.adoc +++ b/doc/modules/ROOT/partials/tutorial-list.adoc @@ -7,5 +7,6 @@ * xref:tutorials/node-regression-with-subgraph-and-graph-sample.adoc[] * xref:tutorials/heterogeneous-node-classification-with-hashgnn.adoc[] * xref:tutorials/kge-predict-transe-pyg-train.adoc[] +* xref:tutorials/visualize.adoc[] * xref:tutorials/gds-sessions.adoc[] (Beta) * xref:tutorials/gds-sessions-self-managed.adoc[] (Beta) diff --git a/examples/visualize-with-pyvis.ipynb b/examples/visualize.ipynb similarity index 96% rename from examples/visualize-with-pyvis.ipynb rename to examples/visualize.ipynb index ba273413f..b6d382987 100644 --- a/examples/visualize-with-pyvis.ipynb +++ b/examples/visualize.ipynb @@ -5,7 +5,7 @@ "id": "82568b27", "metadata": {}, "source": [ - "# GDS Projection Visualization with PyVis" + "# Visualizing GDS Projections" ] }, { @@ -155,7 +155,7 @@ "metadata": {}, "outputs": [], "source": [ - "G_sample, _ = gds.alpha.graph.sample.rwr(\"cora_sample\", G, randomSeed=42, concurrency=1)" + "G_sample, _ = gds.graph.sample.rwr(\"cora_sample\", G, randomSeed=42, concurrency=1)" ] }, { @@ -216,7 +216,7 @@ "metadata": {}, "outputs": [], "source": [ - "sample_topology_df = 
gds.beta.graph.relationships.stream(G_sample)\n", + "sample_topology_df = gds.graph.relationships.stream(G_sample)\n", "display(sample_topology_df)" ] }, @@ -290,7 +290,7 @@ "id": "8f8bc4a2", "metadata": {}, "source": [ - "Unsuprisingly we can see that papers largely seem clustered by academic subject.\n", + "Unsurprisingly we can see that papers largely seem clustered by academic subject.\n", "We also note that some nodes appear larger in size, indicating that they have a higher centrality score according to PageRank.\n", "\n", "We can scroll over the graphic to zoom in/out, and \"click and drag\" the background to navigate to different parts of the network.\n", From 090f00847559f50228d67247e0286e1ad7c43fef Mon Sep 17 00:00:00 2001 From: Adam Schill Collberg Date: Wed, 16 Oct 2024 14:26:12 +0200 Subject: [PATCH 3/9] WIP --- .../ROOT/pages/tutorials/visualize.adoc | 65 +++ doc/modules/ROOT/partials/cora-sample.html | 444 ++++++++++++++++++ examples/cora-sample.html | 444 ++++++++++++++++++ examples/visualize.ipynb | 356 +++++++++++++- 4 files changed, 1291 insertions(+), 18 deletions(-) create mode 100644 doc/modules/ROOT/partials/cora-sample.html create mode 100644 examples/cora-sample.html diff --git a/doc/modules/ROOT/pages/tutorials/visualize.adoc b/doc/modules/ROOT/pages/tutorials/visualize.adoc index 545a585b3..eb7722004 100644 --- a/doc/modules/ROOT/pages/tutorials/visualize.adoc +++ b/doc/modules/ROOT/pages/tutorials/visualize.adoc @@ -80,6 +80,10 @@ print(f"Metadata for our loaded Cora graph `G`: {G}") print(f"Node labels present in `G`: {G.node_labels()}") ---- + +Metadata for our loaded Cora graph `G`: Graph(name=cora, node_count=2708, relationship_count=5429) +Node labels present in `G`: ['Paper'] + It’s looks correct! Now let’s go ahead and sample the graph. 
We use the random walk with restarts sampling algorithm to get a smaller @@ -104,6 +108,10 @@ print(f"Number of nodes in our sample: {G_sample.node_count()}") print(f"Number of relationships in our sample: {G_sample.relationship_count()}") ---- + +Number of nodes in our sample: 406 +Number of relationships in our sample: 532 + Let’s also compute https://neo4j.com/docs/graph-data-science/current/algorithms/page-rank/[PageRank] on our sample graph, in order to get an importance score that we call @@ -115,6 +123,19 @@ visualize the graph. gds.pageRank.mutate(G_sample, mutateProperty="rank") ---- +---- +mutateMillis 0 +nodePropertiesWritten 406 +ranIterations 20 +didConverge False +centralityDistribution {'min': 0.14999961853027344, 'max': 2.27294921... +postProcessingMillis 1 +preProcessingMillis 0 +computeMillis 7 +configuration {'mutateProperty': 'rank', 'jobId': '5ca450ff-... +Name: 0, dtype: object +---- + == Exporting the sampled Cora graph We can now export the topology and node properties of our sampled graph @@ -128,6 +149,24 @@ sample_topology_df = gds.graph.relationships.stream(G_sample) display(sample_topology_df) ---- +[cols=",,,",options="header",] +|=== +| |sourceNodeId |targetNodeId |relationshipType +|0 |31336 |31349 |CITES +|1 |31336 |686532 |CITES +|2 |31336 |1129442 |CITES +|3 |31349 |686532 |CITES +|4 |31353 |31336 |CITES +|... |... |... |... +|527 |34961 |31043 |CITES +|528 |34961 |22883 |CITES +|529 |102879 |9513 |CITES +|530 |102884 |9513 |CITES +|531 |767763 |1136631 |CITES +|=== + +532 rows × 3 columns + We get the right amount of rows, one for each expected relationship. So that looks good. @@ -147,6 +186,24 @@ sample_node_properties_df = gds.graph.nodeProperties.stream( display(sample_node_properties_df) ---- +[cols=",,,",options="header",] +|=== +| |nodeId |rank |subject +|0 |164 |0.245964 |4.0 +|1 |434 |0.158500 |2.0 +|2 |1694 |0.961240 |5.0 +|3 |1949 |0.224912 |6.0 +|4 |1952 |0.150000 |6.0 +|... |... |... |... 
+|401 |1154103 |0.319498 |3.0 +|402 |1154124 |0.627706 |0.0 +|403 |1154169 |0.154784 |0.0 +|404 |1154251 |0.187675 |0.0 +|405 |1154276 |0.277500 |0.0 +|=== + +406 rows × 3 columns + Now that we have all the data we want to visualize, we can create a network with PyVis. We color each node according to its ``subject'', and size it according to its ``rank''. @@ -174,6 +231,14 @@ net.add_edges(zip(sample_topology_df["sourceNodeId"], sample_topology_df["target net.show("cora-sample.html") ---- + +ifdef::backend-html5[] +++++ +include::ROOT:partial$/cora-sample.html[] +++++ +endif::[] + + Unsurprisingly we can see that papers largely seem clustered by academic subject. We also note that some nodes appear larger in size, indicating that they have a higher centrality score according to PageRank. diff --git a/doc/modules/ROOT/partials/cora-sample.html b/doc/modules/ROOT/partials/cora-sample.html new file mode 100644 index 000000000..0f2bc651e --- /dev/null +++ b/doc/modules/ROOT/partials/cora-sample.html @@ -0,0 +1,444 @@ + + + + + + + + + + + + + + + +
+

+
+ + + + + + +
+

+
+ + + + + +
+ + +
+
+ + +
+
+
0%
+
+
+
+
+
+ + + + + + \ No newline at end of file diff --git a/examples/cora-sample.html b/examples/cora-sample.html new file mode 100644 index 000000000..0f2bc651e --- /dev/null +++ b/examples/cora-sample.html @@ -0,0 +1,444 @@ + + + + + + + + + + + + + + + +
+

+
+ + + + + + +
+

+
+ + + + + +
+ + +
+
+ + +
+
+
0%
+
+
+
+
+
+ + + + + + \ No newline at end of file diff --git a/examples/visualize.ipynb b/examples/visualize.ipynb index b6d382987..d69e73c7b 100644 --- a/examples/visualize.ipynb +++ b/examples/visualize.ipynb @@ -65,7 +65,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "26fb276b", "metadata": {}, "outputs": [], @@ -77,7 +77,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "937f1476", "metadata": {}, "outputs": [], @@ -109,7 +109,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "a575da60", "metadata": {}, "outputs": [], @@ -127,10 +127,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "428ac7b8", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Metadata for our loaded Cora graph `G`: Graph(name=cora, node_count=2708, relationship_count=5429)\n", + "Node labels present in `G`: ['Paper']\n" + ] + } + ], "source": [ "print(f\"Metadata for our loaded Cora graph `G`: {G}\")\n", "print(f\"Node labels present in `G`: {G.node_labels()}\")" @@ -150,7 +159,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "3d116738", "metadata": {}, "outputs": [], @@ -169,10 +178,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "d66ea2d5", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of nodes in our sample: 406\n", + "Number of relationships in our sample: 532\n" + ] + } + ], "source": [ "print(f\"Number of nodes in our sample: {G_sample.node_count()}\")\n", "print(f\"Number of relationships in our sample: {G_sample.relationship_count()}\")" @@ -189,10 +207,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "3d80d653", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + 
"text/plain": [ + "mutateMillis 0\n", + "nodePropertiesWritten 406\n", + "ranIterations 20\n", + "didConverge False\n", + "centralityDistribution {'min': 0.14999961853027344, 'max': 2.27294921...\n", + "postProcessingMillis 1\n", + "preProcessingMillis 0\n", + "computeMillis 7\n", + "configuration {'mutateProperty': 'rank', 'jobId': '5ca450ff-...\n", + "Name: 0, dtype: object" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "gds.pageRank.mutate(G_sample, mutateProperty=\"rank\")" ] @@ -211,10 +249,129 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "d3c76b25", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sourceNodeIdtargetNodeIdrelationshipType
03133631349CITES
131336686532CITES
2313361129442CITES
331349686532CITES
43135331336CITES
............
5273496131043CITES
5283496122883CITES
5291028799513CITES
5301028849513CITES
5317677631136631CITES
\n", + "

532 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " sourceNodeId targetNodeId relationshipType\n", + "0 31336 31349 CITES\n", + "1 31336 686532 CITES\n", + "2 31336 1129442 CITES\n", + "3 31349 686532 CITES\n", + "4 31353 31336 CITES\n", + ".. ... ... ...\n", + "527 34961 31043 CITES\n", + "528 34961 22883 CITES\n", + "529 102879 9513 CITES\n", + "530 102884 9513 CITES\n", + "531 767763 1136631 CITES\n", + "\n", + "[532 rows x 3 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "sample_topology_df = gds.graph.relationships.stream(G_sample)\n", "display(sample_topology_df)" @@ -235,10 +392,129 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "1c52e3b2", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nodeIdranksubject
01640.2459644.0
14340.1585002.0
216940.9612405.0
319490.2249126.0
419520.1500006.0
............
40111541030.3194983.0
40211541240.6277060.0
40311541690.1547840.0
40411542510.1876750.0
40511542760.2775000.0
\n", + "

406 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " nodeId rank subject\n", + "0 164 0.245964 4.0\n", + "1 434 0.158500 2.0\n", + "2 1694 0.961240 5.0\n", + "3 1949 0.224912 6.0\n", + "4 1952 0.150000 6.0\n", + ".. ... ... ...\n", + "401 1154103 0.319498 3.0\n", + "402 1154124 0.627706 0.0\n", + "403 1154169 0.154784 0.0\n", + "404 1154251 0.187675 0.0\n", + "405 1154276 0.277500 0.0\n", + "\n", + "[406 rows x 3 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "sample_node_properties_df = gds.graph.nodeProperties.stream(\n", " G_sample,\n", @@ -259,10 +535,40 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "93cb3bdf", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "cora-sample.html\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "net = Network(notebook = True,\n", "cdn_resources=\"remote\",\n", @@ -313,7 +619,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "33523a0b", "metadata": {}, "outputs": [], @@ -324,8 +630,22 @@ } ], "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" } }, "nbformat": 4, From 3982faa1bb388bafd160220bcf013daf8f6c6e6c Mon Sep 17 00:00:00 2001 From: Adam Schill Collberg Date: Thu, 17 Oct 2024 11:05:38 +0200 Subject: [PATCH 4/9] Add G.visualize() method --- graphdatascience/graph/graph_object.py | 72 ++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git 
a/graphdatascience/graph/graph_object.py b/graphdatascience/graph/graph_object.py index e5e06eb5a..2803c3e3c 100644 --- a/graphdatascience/graph/graph_object.py +++ b/graphdatascience/graph/graph_object.py @@ -2,6 +2,7 @@ from types import TracebackType from typing import Any, List, Optional, Type, Union +from uuid import uuid4 from pandas import Series @@ -230,3 +231,74 @@ def __repr__(self) -> str: "memoryUsage", ] return f"{self.__class__.__name__}({self._graph_info(yields=yield_fields).to_dict()})" + + def visualize(self, node_count: int = 100): + visual_graph = self._name + if self.node_count() > node_count: + ratio = float(node_count) / self.node_count() + visual_graph = str(uuid4()) + self._query_runner.call_procedure( + endpoint="gds.graph.sample.rwr", + params=CallParameters( + graph_name=visual_graph, fromGraphName=self._name, config=dict(samplingRatio=ratio) + ), + custom_error=False, + ) + + pr_prop = str(uuid4()) + self._query_runner.call_procedure( + endpoint="gds.pageRank.mutate", + params=CallParameters(graph_name=visual_graph, config=dict(mutateProperty=pr_prop)), + custom_error=False, + ) + + result = self._query_runner.call_procedure( + endpoint="gds.graph.nodeProperties.stream", + params=CallParameters(graph_name=visual_graph, properties=[pr_prop]), + custom_error=False, + ) + + # new format was requested, but the query was run via Cypher + if "propertyValue" in result.keys(): + wide_result = result.pivot(index=["nodeId"], columns=["nodeProperty"], values="propertyValue") + result = wide_result.reset_index() + result.columns.name = None + node_properties_df = result + + relationships_df = self._query_runner.call_procedure( + endpoint="gds.graph.relationships.stream", + params=CallParameters(graph_name=visual_graph), + custom_error=False, + ) + + if visual_graph != self._name: + self._query_runner.call_procedure( + endpoint="gds.graph.drop", + params=CallParameters(graph_name=visual_graph), + custom_error=False, + ) + else: + 
self._query_runner.call_procedure( + endpoint="gds.graph.nodeProperties.drop", + params=CallParameters(graph_name=visual_graph, nodeProperties=pr_prop), + custom_error=False, + ) + + from pyvis.network import Network + + net = Network( + notebook=True, + cdn_resources="remote", + bgcolor="#222222", + font_color="white", + height="750px", # Modify according to your screen size + width="100%", + ) + + for _, node in node_properties_df.iterrows(): + net.add_node(int(node["nodeId"]), value=node[pr_prop]) + + # Add all the relationships + net.add_edges(zip(relationships_df["sourceNodeId"], relationships_df["targetNodeId"])) + + return net.show(f"{self._name}.html") From 5575cf3c5d4ed66c0497dc08394c665824051efe Mon Sep 17 00:00:00 2001 From: Adam Schill Collberg Date: Fri, 18 Oct 2024 09:19:14 +0200 Subject: [PATCH 5/9] Add node coloring by label for `G.visualize` --- graphdatascience/graph/graph_object.py | 42 +++++++++++++++++++++----- 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/graphdatascience/graph/graph_object.py b/graphdatascience/graph/graph_object.py index 2803c3e3c..2bf2b8726 100644 --- a/graphdatascience/graph/graph_object.py +++ b/graphdatascience/graph/graph_object.py @@ -1,5 +1,7 @@ from __future__ import annotations +import colorsys +import random from types import TracebackType from typing import Any, List, Optional, Type, Union from uuid import uuid4 @@ -232,16 +234,18 @@ def __repr__(self) -> str: ] return f"{self.__class__.__name__}({self._graph_info(yields=yield_fields).to_dict()})" - def visualize(self, node_count: int = 100): + def visualize(self, node_count: int = 100, center_nodes: Optional[List[int]] = None) -> Any: visual_graph = self._name if self.node_count() > node_count: - ratio = float(node_count) / self.node_count() visual_graph = str(uuid4()) + config = dict(samplingRatio=float(node_count) / self.node_count()) + + if center_nodes is not None: + config["startNodes"] = center_nodes + self._query_runner.call_procedure( 
endpoint="gds.graph.sample.rwr", - params=CallParameters( - graph_name=visual_graph, fromGraphName=self._name, config=dict(samplingRatio=ratio) - ), + params=CallParameters(graph_name=visual_graph, fromGraphName=self._name, config=config), custom_error=False, ) @@ -254,13 +258,22 @@ def visualize(self, node_count: int = 100): result = self._query_runner.call_procedure( endpoint="gds.graph.nodeProperties.stream", - params=CallParameters(graph_name=visual_graph, properties=[pr_prop]), + params=CallParameters( + graph_name=visual_graph, + properties=[pr_prop], + nodeLabels=self.node_labels(), + config=dict(listNodeLabels=True), + ), custom_error=False, ) # new format was requested, but the query was run via Cypher if "propertyValue" in result.keys(): wide_result = result.pivot(index=["nodeId"], columns=["nodeProperty"], values="propertyValue") + # nodeLabels cannot be an index column of the pivot as its not hashable + # so we need to manually join it back in + labels_df = result[["nodeId", "nodeLabels"]].set_index("nodeId") + wide_result = wide_result.join(labels_df, on="nodeId") result = wide_result.reset_index() result.columns.name = None node_properties_df = result @@ -271,6 +284,7 @@ def visualize(self, node_count: int = 100): custom_error=False, ) + # Clean up if visual_graph != self._name: self._query_runner.call_procedure( endpoint="gds.graph.drop", @@ -289,16 +303,28 @@ def visualize(self, node_count: int = 100): net = Network( notebook=True, cdn_resources="remote", - bgcolor="#222222", + bgcolor="#222222", # Dark background font_color="white", height="750px", # Modify according to your screen size width="100%", ) + label_to_color = {label: self._random_bright_color() for label in self.node_labels()} + for _, node in node_properties_df.iterrows(): - net.add_node(int(node["nodeId"]), value=node[pr_prop]) + net.add_node( + int(node["nodeId"]), + value=node[pr_prop], + color=label_to_color[node["nodeLabels"][0]], + title=str(node["nodeId"]), + ) # Add all the 
relationships net.add_edges(zip(relationships_df["sourceNodeId"], relationships_df["targetNodeId"])) return net.show(f"{self._name}.html") + + @staticmethod + def _random_bright_color() -> str: + h = random.randint(0, 255) / 255.0 + return "#%02X%02X%02X" % tuple(map(lambda x: int(x * 255), colorsys.hls_to_rgb(h, 0.7, 1.0))) From 1498df5e4642e36c220842ca04111d5b755ab7d7 Mon Sep 17 00:00:00 2001 From: Adam Schill Collberg Date: Fri, 18 Oct 2024 09:57:05 +0200 Subject: [PATCH 6/9] Add more info when hovering node for `G.visualize` --- graphdatascience/graph/graph_object.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/graphdatascience/graph/graph_object.py b/graphdatascience/graph/graph_object.py index 2bf2b8726..6e91d1ce2 100644 --- a/graphdatascience/graph/graph_object.py +++ b/graphdatascience/graph/graph_object.py @@ -234,7 +234,9 @@ def __repr__(self) -> str: ] return f"{self.__class__.__name__}({self._graph_info(yields=yield_fields).to_dict()})" - def visualize(self, node_count: int = 100, center_nodes: Optional[List[int]] = None) -> Any: + def visualize( + self, node_count: int = 100, center_nodes: Optional[List[int]] = None, include_node_properties: List[str] = None + ) -> Any: visual_graph = self._name if self.node_count() > node_count: visual_graph = str(uuid4()) @@ -256,11 +258,15 @@ def visualize(self, node_count: int = 100, center_nodes: Optional[List[int]] = N custom_error=False, ) + node_properties = [pr_prop] + if include_node_properties is not None: + node_properties.extend(include_node_properties) + result = self._query_runner.call_procedure( endpoint="gds.graph.nodeProperties.stream", params=CallParameters( graph_name=visual_graph, - properties=[pr_prop], + properties=node_properties, nodeLabels=self.node_labels(), config=dict(listNodeLabels=True), ), @@ -312,11 +318,17 @@ def visualize(self, node_count: int = 100, center_nodes: Optional[List[int]] = N label_to_color = {label: self._random_bright_color() for 
label in self.node_labels()} for _, node in node_properties_df.iterrows(): + title = f"Node ID: {node['nodeId']}\nLabels: {node['nodeLabels']}" + if include_node_properties is not None: + title += f"\nNode properties:" + for prop in include_node_properties: + title += f"\n{prop} = {node[prop]}" + net.add_node( int(node["nodeId"]), value=node[pr_prop], color=label_to_color[node["nodeLabels"][0]], - title=str(node["nodeId"]), + title=title, ) # Add all the relationships From 41f5f1f7aab9ccc5699672326f08b260e2c39082 Mon Sep 17 00:00:00 2001 From: Adam Schill Collberg Date: Fri, 18 Oct 2024 10:14:40 +0200 Subject: [PATCH 7/9] Add more features to `G.visualize` --- graphdatascience/graph/graph_object.py | 31 +++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/graphdatascience/graph/graph_object.py b/graphdatascience/graph/graph_object.py index 6e91d1ce2..a6889c42a 100644 --- a/graphdatascience/graph/graph_object.py +++ b/graphdatascience/graph/graph_object.py @@ -235,7 +235,12 @@ def __repr__(self) -> str: return f"{self.__class__.__name__}({self._graph_info(yields=yield_fields).to_dict()})" def visualize( - self, node_count: int = 100, center_nodes: Optional[List[int]] = None, include_node_properties: List[str] = None + self, + notebook: bool = True, + node_count: int = 100, + center_nodes: Optional[List[int]] = None, + include_node_properties: List[str] = None, + color_property: Optional[str] = None, ) -> Any: visual_graph = self._name if self.node_count() > node_count: @@ -262,6 +267,12 @@ def visualize( if include_node_properties is not None: node_properties.extend(include_node_properties) + if color_property is not None: + node_properties.append(color_property) + + # Remove possible duplicates + node_properties = list(set(node_properties)) + result = self._query_runner.call_procedure( endpoint="gds.graph.nodeProperties.stream", params=CallParameters( @@ -307,15 +318,20 @@ def visualize( from pyvis.network import Network net = 
Network( - notebook=True, - cdn_resources="remote", + notebook=True if notebook else False, + cdn_resources="remote" if notebook else "local", bgcolor="#222222", # Dark background font_color="white", height="750px", # Modify according to your screen size width="100%", ) - label_to_color = {label: self._random_bright_color() for label in self.node_labels()} + if color_property is None: + color_map = {label: self._random_bright_color() for label in self.node_labels()} + else: + color_map = { + prop_val: self._random_bright_color() for prop_val in node_properties_df[color_property].unique() + } for _, node in node_properties_df.iterrows(): title = f"Node ID: {node['nodeId']}\nLabels: {node['nodeLabels']}" @@ -324,10 +340,15 @@ def visualize( for prop in include_node_properties: title += f"\n{prop} = {node[prop]}" + if color_property is None: + color = color_map[node["nodeLabels"][0]] + else: + color = color_map[node[color_property]] + net.add_node( int(node["nodeId"]), value=node[pr_prop], - color=label_to_color[node["nodeLabels"][0]], + color=color, title=title, ) From b01e44cb36b12cc5f34caaa557c25b54d6c230eb Mon Sep 17 00:00:00 2001 From: Adam Schill Collberg Date: Fri, 18 Oct 2024 13:56:51 +0200 Subject: [PATCH 8/9] Even more features for `G.visualize` --- graphdatascience/graph/graph_object.py | 119 +++++++++++++++++++------ 1 file changed, 90 insertions(+), 29 deletions(-) diff --git a/graphdatascience/graph/graph_object.py b/graphdatascience/graph/graph_object.py index a6889c42a..ca5e95b6d 100644 --- a/graphdatascience/graph/graph_object.py +++ b/graphdatascience/graph/graph_object.py @@ -1,4 +1,5 @@ from __future__ import annotations +from itertools import chain import colorsys import random @@ -82,7 +83,6 @@ def node_count(self) -> int: """ Returns: the number of nodes in the graph - """ return self._graph_info(["nodeCount"]) # type: ignore @@ -191,7 +191,6 @@ def drop(self, failIfMissing: bool = False) -> "Series[str]": Returns: the result of the drop 
operation - """ result = self._query_runner.call_procedure( endpoint="gds.graph.drop", @@ -205,7 +204,6 @@ def creation_time(self) -> Any: # neo4j.time.DateTime not exported """ Returns: the creation time of the graph - """ return self._graph_info(["creationTime"]) @@ -236,12 +234,56 @@ def __repr__(self) -> str: def visualize( self, - notebook: bool = True, node_count: int = 100, + directed: bool = True, center_nodes: Optional[List[int]] = None, - include_node_properties: List[str] = None, color_property: Optional[str] = None, + size_property: Optional[str] = None, + include_node_properties: Optional[List[str]] = None, + rel_weight_property: Optional[str] = None, + notebook: bool = True, + px_height: int = 750, + theme: str = "dark", ) -> Any: + """ + Visualize the `Graph` in an interactive graphical interface. + The graph will be sampled down to specified `node_count` to limit computationally expensive rendering. + + Args: + node_count: number of nodes in the graph to be visualized + directed: whether or not to display relationships as directed + center_nodes: nodes around subgraph will be sampled, if sampling is necessary + color_property: node property that determines node categories for coloring. Default is to use node labels + size_property: node property that determines the size of nodes. Default is to compute a page rank for this + include_node_properties: node properties to include for mouse-over inspection + rel_weight_property: relationship property that determines width of relationships + notebook: whether or not the code is run in a notebook + px_height: the height of the graphic containing output the visualization + theme: coloring theme for the visualization. 
"light" or "dark" + + Returns: + an interactive graphical visualization of the specified graph + """ + + actual_node_properties = list(chain.from_iterable(self.node_properties().to_dict().values())) + if (color_property is not None) and (color_property not in actual_node_properties): + raise ValueError(f"There is no node property '{color_property}' in graph '{self._name}'") + + if size_property is not None and size_property not in actual_node_properties: + raise ValueError(f"There is no node property '{size_property}' in graph '{self._name}'") + + if include_node_properties is not None: + for prop in include_node_properties: + if prop not in actual_node_properties: + raise ValueError(f"There is no node property '{prop}' in graph '{self._name}'") + + actual_rel_properties = list(chain.from_iterable(self.relationship_properties().to_dict().values())) + if rel_weight_property is not None and rel_weight_property not in actual_rel_properties: + raise ValueError(f"There is no relationship property '{rel_weight_property}' in graph '{self._name}'") + + if theme not in {"light", "dark"}: + raise ValueError(f"Color `theme` '{theme}' is not allowed. 
Must be either 'light' or 'dark'") + visual_graph = self._name if self.node_count() > node_count: visual_graph = str(uuid4()) @@ -256,14 +298,19 @@ def visualize( custom_error=False, ) - pr_prop = str(uuid4()) - self._query_runner.call_procedure( - endpoint="gds.pageRank.mutate", - params=CallParameters(graph_name=visual_graph, config=dict(mutateProperty=pr_prop)), - custom_error=False, - ) + # Make sure we always have at least a size property so that we can run `gds.graph.nodeProperties.stream` + if size_property is None: + size_property = str(uuid4()) + self._query_runner.call_procedure( + endpoint="gds.pageRank.mutate", + params=CallParameters(graph_name=visual_graph, config=dict(mutateProperty=size_property)), + custom_error=False, + ) + clean_up_size_prop = True + else: + clean_up_size_prop = False - node_properties = [pr_prop] + node_properties = [size_property] if include_node_properties is not None: node_properties.extend(include_node_properties) @@ -295,11 +342,18 @@ def visualize( result.columns.name = None node_properties_df = result - relationships_df = self._query_runner.call_procedure( - endpoint="gds.graph.relationships.stream", - params=CallParameters(graph_name=visual_graph), - custom_error=False, - ) + if rel_weight_property is None: + relationships_df = self._query_runner.call_procedure( + endpoint="gds.graph.relationships.stream", + params=CallParameters(graph_name=visual_graph), + custom_error=False, + ) + else: + relationships_df = self._query_runner.call_procedure( + endpoint="gds.graph.relationshipProperty.stream", + params=CallParameters(graph_name=visual_graph, properties=rel_weight_property), + custom_error=False, + ) # Clean up if visual_graph != self._name: @@ -308,10 +362,10 @@ def visualize( params=CallParameters(graph_name=visual_graph), custom_error=False, ) - else: + elif clean_up_size_prop: self._query_runner.call_procedure( endpoint="gds.graph.nodeProperties.drop", - params=CallParameters(graph_name=visual_graph, 
nodeProperties=pr_prop), + params=CallParameters(graph_name=visual_graph, nodeProperties=size_property), custom_error=False, ) @@ -320,19 +374,21 @@ def visualize( net = Network( notebook=True if notebook else False, cdn_resources="remote" if notebook else "local", - bgcolor="#222222", # Dark background - font_color="white", - height="750px", # Modify according to your screen size + directed=directed, + bgcolor="#222222" if theme == "dark" else "#F2F2F2", + font_color="white" if theme == "dark" else "black", + height=f"{px_height}px", width="100%", ) if color_property is None: - color_map = {label: self._random_bright_color() for label in self.node_labels()} + color_map = {label: self._random_themed_color(theme) for label in self.node_labels()} else: color_map = { - prop_val: self._random_bright_color() for prop_val in node_properties_df[color_property].unique() + prop_val: self._random_themed_color(theme) for prop_val in node_properties_df[color_property].unique() } + # Add all the nodes for _, node in node_properties_df.iterrows(): title = f"Node ID: {node['nodeId']}\nLabels: {node['nodeLabels']}" if include_node_properties is not None: @@ -347,17 +403,22 @@ def visualize( net.add_node( int(node["nodeId"]), - value=node[pr_prop], + value=node[size_property], color=color, title=title, ) # Add all the relationships - net.add_edges(zip(relationships_df["sourceNodeId"], relationships_df["targetNodeId"])) + for _, rel in relationships_df.iterrows(): + if rel_weight_property is None: + net.add_edge(rel["sourceNodeId"], rel["targetNodeId"], title=f"Type: {rel['relationshipType']}") + else: + title = f"Type: {rel['relationshipType']}\n{rel_weight_property} = {rel['rel_weight_property']}" + net.add_edge(rel["sourceNodeId"], rel["targetNodeId"], title=title, value=rel[rel_weight_property]) return net.show(f"{self._name}.html") @staticmethod - def _random_bright_color() -> str: - h = random.randint(0, 255) / 255.0 - return "#%02X%02X%02X" % tuple(map(lambda x: int(x * 255), 
colorsys.hls_to_rgb(h, 0.7, 1.0))) + def _random_themed_color(theme) -> str: + l = 0.7 if theme == "dark" else 0.4 + return "#%02X%02X%02X" % tuple(map(lambda x: int(x * 255), colorsys.hls_to_rgb(random.random(), l, 1.0))) From 0703121a25420a97acca4122c871638486784ac9 Mon Sep 17 00:00:00 2001 From: Adam Schill Collberg Date: Fri, 18 Oct 2024 14:55:35 +0200 Subject: [PATCH 9/9] Add some `G.visualize` example calls to notebook --- examples/visualize.ipynb | 535 ++++++++++++++------------------------- 1 file changed, 195 insertions(+), 340 deletions(-) diff --git a/examples/visualize.ipynb b/examples/visualize.ipynb index d69e73c7b..4ac4e1d18 100644 --- a/examples/visualize.ipynb +++ b/examples/visualize.ipynb @@ -65,7 +65,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "26fb276b", "metadata": {}, "outputs": [], @@ -77,7 +77,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "937f1476", "metadata": {}, "outputs": [], @@ -94,12 +94,175 @@ "gds = GraphDataScience(NEO4J_URI, auth=NEO4J_AUTH, database=NEO4J_DB)" ] }, + { + "cell_type": "markdown", + "id": "a96b84b5", + "metadata": {}, + "source": [ + "## Built-in visualization" + ] + }, + { + "cell_type": "markdown", + "id": "aa1f778c", + "metadata": {}, + "source": [ + "### IMDB" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "68125987", + "metadata": {}, + "outputs": [], + "source": [ + "G_imdb = gds.graph.load_imdb()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "acc92487", + "metadata": {}, + "outputs": [], + "source": [ + "G_imdb.visualize()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "21b17ba8", + "metadata": {}, + "outputs": [], + "source": [ + "G_imdb.visualize(directed=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a2f6d726", + "metadata": {}, + "outputs": [], + "source": [ + "G_imdb.visualize(directed=False, theme=\"light\")" + 
] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29534589", + "metadata": {}, + "outputs": [], + "source": [ + "G_imdb.visualize(directed=False, node_count=200)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b751ec2c", + "metadata": {}, + "outputs": [], + "source": [ + "G_imdb.drop()" + ] + }, + { + "cell_type": "markdown", + "id": "c6713a9d", + "metadata": {}, + "source": [ + "### Cora" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe00c465", + "metadata": {}, + "outputs": [], + "source": [ + "G_cora = gds.graph.load_cora()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "258747fc", + "metadata": {}, + "outputs": [], + "source": [ + "G_cora.visualize()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d5a0704", + "metadata": {}, + "outputs": [], + "source": [ + "G_cora.visualize(color_property=\"subject\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7dfe9e57", + "metadata": {}, + "outputs": [], + "source": [ + "G_cora.visualize(color_property=\"subject\", include_node_properties=[\"subject\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db635dd6", + "metadata": {}, + "outputs": [], + "source": [ + "gds.degree.mutate(G_cora, mutateProperty=\"deg\")\n", + "G_cora.visualize(color_property=\"subject\", size_property=\"deg\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6bc16f4", + "metadata": {}, + "outputs": [], + "source": [ + "G_cora.visualize(color_property=\"subject\", center_nodes=[12350])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "447f1a2d", + "metadata": {}, + "outputs": [], + "source": [ + "G_cora.drop()" + ] + }, + { + "cell_type": "markdown", + "id": "56dc02c4", + "metadata": {}, + "source": [ + "## Using PyVis" + ] + }, { "cell_type": "markdown", "id": "b1a8ab78", "metadata": {}, "source": [ - "## Sampling Cora\n", + "### 
Sampling Cora\n", "\n", "Next we use the [built-in Cora loader](https://neo4j.com/docs/graph-data-science-client/current/common-datasets/#_cora) to get the data into GDS.\n", "The nodes in the Cora dataset is represented by academic papers, and the relationships connecting them are citations.\n", @@ -109,7 +272,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "a575da60", "metadata": {}, "outputs": [], @@ -127,24 +290,25 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "428ac7b8", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Metadata for our loaded Cora graph `G`: Graph(name=cora, node_count=2708, relationship_count=5429)\n", - "Node labels present in `G`: ['Paper']\n" - ] - } - ], + "outputs": [], "source": [ "print(f\"Metadata for our loaded Cora graph `G`: {G}\")\n", "print(f\"Node labels present in `G`: {G.node_labels()}\")" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6aceb0a", + "metadata": {}, + "outputs": [], + "source": [ + "G.visualize(color_property=\"subject\")" + ] + }, { "cell_type": "markdown", "id": "3e8b8a83", @@ -159,7 +323,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "3d116738", "metadata": {}, "outputs": [], @@ -178,19 +342,10 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "d66ea2d5", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Number of nodes in our sample: 406\n", - "Number of relationships in our sample: 532\n" - ] - } - ], + "outputs": [], "source": [ "print(f\"Number of nodes in our sample: {G_sample.node_count()}\")\n", "print(f\"Number of relationships in our sample: {G_sample.relationship_count()}\")" @@ -207,30 +362,10 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "3d80d653", "metadata": {}, - "outputs": [ - { - 
"data": { - "text/plain": [ - "mutateMillis 0\n", - "nodePropertiesWritten 406\n", - "ranIterations 20\n", - "didConverge False\n", - "centralityDistribution {'min': 0.14999961853027344, 'max': 2.27294921...\n", - "postProcessingMillis 1\n", - "preProcessingMillis 0\n", - "computeMillis 7\n", - "configuration {'mutateProperty': 'rank', 'jobId': '5ca450ff-...\n", - "Name: 0, dtype: object" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "gds.pageRank.mutate(G_sample, mutateProperty=\"rank\")" ] @@ -240,7 +375,7 @@ "id": "722caa7b", "metadata": {}, "source": [ - "## Exporting the sampled Cora graph\n", + "### Exporting the sampled Cora graph\n", "\n", "We can now export the topology and node properties of our sampled graph that we want to visualize.\n", "\n", @@ -249,129 +384,10 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "d3c76b25", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sourceNodeIdtargetNodeIdrelationshipType
03133631349CITES
131336686532CITES
2313361129442CITES
331349686532CITES
43135331336CITES
............
5273496131043CITES
5283496122883CITES
5291028799513CITES
5301028849513CITES
5317677631136631CITES
\n", - "

532 rows × 3 columns

\n", - "
" - ], - "text/plain": [ - " sourceNodeId targetNodeId relationshipType\n", - "0 31336 31349 CITES\n", - "1 31336 686532 CITES\n", - "2 31336 1129442 CITES\n", - "3 31349 686532 CITES\n", - "4 31353 31336 CITES\n", - ".. ... ... ...\n", - "527 34961 31043 CITES\n", - "528 34961 22883 CITES\n", - "529 102879 9513 CITES\n", - "530 102884 9513 CITES\n", - "531 767763 1136631 CITES\n", - "\n", - "[532 rows x 3 columns]" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "sample_topology_df = gds.graph.relationships.stream(G_sample)\n", "display(sample_topology_df)" @@ -392,129 +408,10 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "1c52e3b2", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nodeIdranksubject
01640.2459644.0
14340.1585002.0
216940.9612405.0
319490.2249126.0
419520.1500006.0
............
40111541030.3194983.0
40211541240.6277060.0
40311541690.1547840.0
40411542510.1876750.0
40511542760.2775000.0
\n", - "

406 rows × 3 columns

\n", - "
" - ], - "text/plain": [ - " nodeId rank subject\n", - "0 164 0.245964 4.0\n", - "1 434 0.158500 2.0\n", - "2 1694 0.961240 5.0\n", - "3 1949 0.224912 6.0\n", - "4 1952 0.150000 6.0\n", - ".. ... ... ...\n", - "401 1154103 0.319498 3.0\n", - "402 1154124 0.627706 0.0\n", - "403 1154169 0.154784 0.0\n", - "404 1154251 0.187675 0.0\n", - "405 1154276 0.277500 0.0\n", - "\n", - "[406 rows x 3 columns]" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "sample_node_properties_df = gds.graph.nodeProperties.stream(\n", " G_sample,\n", @@ -529,46 +426,18 @@ "id": "ff81f977", "metadata": {}, "source": [ + "### Render visualization\n", + "\n", "Now that we have all the data we want to visualize, we can create a network with PyVis.\n", "We color each node according to its \"subject\", and size it according to its \"rank\"." ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "id": "93cb3bdf", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "cora-sample.html\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "net = Network(notebook = True,\n", "cdn_resources=\"remote\",\n", @@ -619,7 +488,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "id": "33523a0b", "metadata": {}, "outputs": [], @@ -630,22 +499,8 @@ } ], "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.6" + "name": "python" } }, "nbformat": 4,