Knowledge Graphs using TigerGraph (version: 3.10)
TigerGraph NOMENCLATURE
- Vertex: Represents entities/nodes in the graph, such as people, places, or things.
- Edge: Represents relationships between vertices.
- Attribute: Both vertex and edges have properties/attributes.
- Path: Measured by the number of relationships in the path. E.g. for a 3 node path, the length would be 2.
TigerGraph CLOUD & GraphStudio SERVER SETUP
- Go to https://tgcloud.io/. Sign up & login.
- Go to https://tgcloud.io/app/clusters and click on
Create Cluster
.
Create a FREE cluster, and wait for 5 minutes.🍿 - Click on the newly created cluster.
->Tools
->GraphStudio
ACCESS MANAGEMENT
- Go to
https://tgcloud.io/app/clusters
-> select your newly created cluster
-> selectAccess Management
in the top bar - Select
Database Access
-> and click onAdd Database User
.
Add username and password. - Go to
Role Management
.
And give appropriate roles to this user.
GSQLShell — GSQL WEB SHELL
- Define vertex and edge schema
# Create vertices and edges
use global
CREATE VERTEX person (
PRIMARY_ID name STRING, born DATETIME
)
WITH primary_id_as_attribute="true"
CREATE VERTEX movie (
PRIMARY_ID title STRING, tagline STRING, released DATETIME
)
WITH primary_id_as_attribute="true"
CREATE UNDIRECTED EDGE ACTED_IN (FROM person, TO movie)
CREATE UNDIRECTED EDGE DIRECTED (FROM person, TO movie)
CREATE UNDIRECTED EDGE FRIEND (FROM person, TO person)
- Create graph
# Create graph
CREATE GRAPH movie_graph(person, movie, ACTED_IN, DIRECTED, FRIEND)
# View graph
LS
Now, we will repeat the above steps and add data using python package: pyTigerGraph
.
LAB CONFIGURATION
In this section we will install the pyTigerGraph
package and set up connection to GraphStudio server.
Fetch packages
!pip install pyTigerGraph
!pip install python-dotenv
import pyTigerGraph as tg
from dotenv import load_dotenv
import os
import pandas as pd
import json
import requests
Grab data files, create data frames from .csv
files, and prepare for upsertion
!git clone https://github.com/ArshdeepKaur/knowledge_graph_using_tigergraph
df_movies = pd.read_csv("/content/knowledge_graph_using_tigergraph/data/movies.csv", sep=";")
df_actors = pd.read_csv("/content/knowledge_graph_using_tigergraph/data/actors.csv", sep=";")
df_acted_in = pd.read_csv("/content/knowledge_graph_using_tigergraph/data/acted_in.csv", sep=";")
print(df_movies.info())
print(df_actors.info())
print(df_acted_in.info())
Look for attributes with NaN
/Null values
df_movies
null_df = df_movies[df_movies.isna().any(axis=1)]
print(null_df)
title tagline released
33 Something's Gotta Give NaN 2003
df_actors
null_df = df_actors[df_actors.isna().any(axis=1)]
print(null_df)
name born
103 Naomie Harris NaN
129 Paul Blythe NaN
130 Angela Scope NaN
131 Jessica Thompson NaN
132 James Thompson NaN
Replace all NaN
attributes with ‘’ empty strings or 0 values depending on their data type. Otherwise we will get an error: TigerGraphException: (‘Null JSON value is not supported.’, ‘REST-30200’)
df_movies = df_movies.fillna('') #tagline (str) has NaN
df_actors = df_actors.fillna(0) #born (int) has NaN
Create host string using subdomain
load_dotenv('/content/env', override=True)
TG_IP = os.getenv('TG_IP')
TG_CLUSTER = os.getenv('TG_CLUSTER')
TG_SUBDOMAIN = os.getenv('TG_SUBDOMAIN')
TG_DOMAIN=f"{TG_SUBDOMAIN}.i.tgcloud.io"
TG_SUBNET = os.getenv('TG_SUBNET')
TG_USERNAME = os.getenv('TG_USERNAME')
TG_PASSWORD = os.getenv('TG_PASSWORD')
TG_GRAPH = os.getenv('TG_GRAPH')
host_string = f"https://{TG_DOMAIN}"
Setup connection
conn = tg.TigerGraphConnection(host=host_string, username=TG_USERNAME, password=TG_PASSWORD)
print("Connected")
print(conn)
CREATING GRAPH SCHEMA USING pyTigerGraph
At this point we will drop the whole schema (previously created using GSQLShell
) and create everything afresh using pyTigerGraph
#Dangerous!
conn.gsql('''
USE GLOBAL
DROP ALL
''')
Create vertices and edges (again!)
conn.gsql('''
USE GLOBAL
CREATE VERTEX person (
PRIMARY_ID name STRING, born INT
)
WITH primary_id_as_attribute="true"
CREATE VERTEX movie (
PRIMARY_ID title STRING, tagline STRING, released INT
)
WITH primary_id_as_attribute="true"
CREATE UNDIRECTED EDGE ACTED_IN (FROM person, TO movie)
CREATE UNDIRECTED EDGE DIRECTED (FROM person, TO movie)
CREATE UNDIRECTED EDGE FRIEND (FROM person, TO person)
''')
Create Graph
conn.gsql('''
CREATE GRAPH movie_graph(person, movie, ACTED_IN, DIRECTED, FRIEND)
''')
Fetch access token
conn.graphname = TG_GRAPH
secret = conn.createSecret()
token = conn.getToken(secret, setToken=True)
print(conn, conn.graphname)
Verify the schema using conn
object
print(conn.getSchema())
print(conn.gsql("LS"))
print(conn.getVertexTypes())
print(conn.getEdgeTypes())
UPSERTING DATA TO KNOWLEDGE GRAPH USING pyTigerGraph
Upsert (insert + update) themovie
vertex
v_movies = conn.upsertVertexDataFrame(df_movies, 'movie', v_id='title', attributes={'title':'title', 'tagline': 'tagline', 'released': 'released'})
print(str(v_movies) + " movie VERTICES upserted")
Upsert the person
vertex
v_actors = conn.upsertVertexDataFrame(df_actors, 'person', v_id='name', attributes={'name':'name', 'born': 'born'})
print(str(v_actors) + " person VERTICES upserted")
Upsert ACTED_IN
edge
e_ACTED_IN = conn.upsertEdgeDataFrame(df=df_acted_in, sourceVertexType='person', edgeType="ACTED_IN", targetVertexType='movie', from_id='actor', to_id='movie', attributes={}
)
print(str(e_ACTED_IN) + " ACTED_IN EDGES upserted")
QUERYING KNOWLEDGE GRAPHS USING pyTigerGraph
A simple 1-hop parameterized query (without accumulators):
The query starts by seeding a vertex set start
with the person
vertex identified by parameter p passed in from the query call.
Next, the SELECT
statement describes a 1-hop traversal according to the pattern described in the FROM
clause:
start:s -(ACTED_IN:e)- movie:tgt
# simple select query
query = '''
USE GRAPH movie_graph
DROP QUERY select_all_movies_for_person
CREATE QUERY select_all_movies_for_person (VERTEX<person> p) {
start = {p};
result = SELECT tgt
FROM start:s -(ACTED_IN:e)- movie:tgt;
PRINT result[result.title];
}
INSTALL QUERY select_all_movies_for_person
'''
results = conn.gsql(query)
print(results)
parameters = {
"p": "Tom Hanks"
}
results = conn.runInstalledQuery("select_all_movies_for_person", params=parameters)
print(json.dumps(results, indent=2))
2-hop parameterized query with accumulators:
Accumulators serve as the runtime attributes (properties) attachable to each vertex visited during our traversal on the graph.
ACCUM
executes its statement(s) once for each matched edge (or path) of the FROM
clause pattern. Further, ACCUM
parallelly executes its statements for all the matches.
POST-ACCUM
executes its statement(s) once for each involved vertex. Note that each statement within the POST-ACCUM
clause can refer to either source vertices or target vertices but not both.
In this example we will explore different accumulator variables (ListAccum
, SumAccum
, AvgAccum
, OrAccum
), data types (INT
, STRING
, BOOL
), scopes (@@
&@
: @@
means the scope is global; @
means the scope is local/vertex-attached.)
- The list of movies that Tom Hanks has worked in:
ListAccum<STRING> @movies;
- The count of movies that Tom Hanks has worked in:
SumAccum <INT> @@count_movies;
- The list costars that he has worked in:
ListAccum<STRING> @costars;
- The count of costars that he has worked in:
SumAccum <INT> @@count_costars;
- Their average age:
AvgAccum @@avg_age;
query = '''
USE GRAPH movie_graph
DROP QUERY movies_of_costars
CREATE QUERY movies_of_costars (VERTEX<person> p, INT current_year = 2024) {
OrAccum @visited = FALSE;
SumAccum <INT> @@count_movies;
SumAccum <INT> @@count_costars;
AvgAccum @@avg_age;
ListAccum<STRING> @movies;
ListAccum<STRING> @costars;
start = {p};
first_neighbors = SELECT tgt_m
FROM start:s -(ACTED_IN:e)- movie:tgt_m
ACCUM tgt_m.@visited += TRUE, s.@visited += TRUE, tgt_m.@movies += tgt_m.title
POST-ACCUM (tgt_m) @@count_movies += 1;
second_neighbors = SELECT tgt_p
FROM person:tgt_p -(:e)- first_neighbors
WHERE tgt_p.@visited == FALSE
POST-ACCUM (tgt_p) @@count_costars += 1, @@avg_age += (current_year - tgt_p.born), tgt_p.@costars += tgt_p.name;
PRINT "titles";
PRINT first_neighbors[first_neighbors.title];
PRINT "titles using @movies accumulator";
PRINT first_neighbors[first_neighbors.@movies];
PRINT "costars";
PRINT second_neighbors[second_neighbors.name];
PRINT "costars using @costars accumulator";
PRINT second_neighbors[second_neighbors.@costars];
PRINT "count of movies";
PRINT @@count_movies;
PRINT "count of costars";
PRINT @@count_costars;
PRINT "average age of costars";
PRINT @@avg_age;
}
INSTALL QUERY movies_of_costars
'''
results = conn.gsql(query)
print(results)
parameters = {
"p": "Tom Hanks"
}
results = conn.runInstalledQuery("movies_of_costars", params=parameters)
print(json.dumps(results, indent=2))
Acknowledgements & References
- Jon Herke at TigerGraph
Setup Connection to TigerGraph 3.0 - Andrew Ng at DeepLearning.AI,
Andreas Kollegger at Neo4j
Short course on Knowledge Graphs for RAG - TigerGraph community:
Need correct Json syntax to upsert a vertex with no attributes - TigerGraph documentation:
Develop Parameterized Queries — GSQL 101
Accumulators Tutorial