Knowledge Graphs using TigerGraph (version: 3.10)

Arshdeep Kaur

5 min readApr 29, 2024

TigerGraph NOMENCLATURE

Vertex: Represents entities/nodes in the graph, such as people, places, or things.
Edge: Represents relationships between vertices.
Attribute: Both vertex and edges have properties/attributes.
Path: Measured by the number of relationships in the path. E.g. for a 3 node path, the length would be 2.

TigerGraph CLOUD & GraphStudio SERVER SETUP

Go to https://tgcloud.io/. Sign up & login.
Go to https://tgcloud.io/app/clusters and click on Create Cluster.
Create a FREE cluster, and wait for 5 minutes.🍿
Click on the newly created cluster.
-> Tools
-> GraphStudio

ACCESS MANAGEMENT

Go to https://tgcloud.io/app/clusters
-> select your newly created cluster
-> select Access Management in the top bar
Select Database Access-> and click on Add Database User.
Add username and password.
Go to Role Management.And give appropriate roles to this user.

GSQLShell — GSQL WEB SHELL

Define vertex and edge schema

# Create vertices and edges
use global 
CREATE VERTEX person (
  PRIMARY_ID name STRING, born DATETIME 
)
WITH primary_id_as_attribute="true"

CREATE VERTEX movie (
  PRIMARY_ID title STRING, tagline STRING, released DATETIME
)
WITH primary_id_as_attribute="true"

CREATE UNDIRECTED EDGE ACTED_IN (FROM person, TO movie)

CREATE UNDIRECTED EDGE DIRECTED (FROM person, TO movie)

CREATE UNDIRECTED EDGE FRIEND (FROM person, TO person)

Create graph

# Create graph
CREATE GRAPH movie_graph(person, movie, ACTED_IN, DIRECTED, FRIEND)

# View graph
LS

Now, we will repeat the above steps and add data using python package: pyTigerGraph.

LAB CONFIGURATION

In this section we will install the pyTigerGraph package and set up connection to GraphStudio server.

Fetch packages

!pip install pyTigerGraph
!pip install python-dotenv
import pyTigerGraph as tg
from dotenv import load_dotenv
import os
import pandas as pd
import json
import requests

Grab data files, create data frames from .csv files, and prepare for upsertion

!git clone https://github.com/ArshdeepKaur/knowledge_graph_using_tigergraph

df_movies = pd.read_csv("/content/knowledge_graph_using_tigergraph/data/movies.csv", sep=";")
df_actors = pd.read_csv("/content/knowledge_graph_using_tigergraph/data/actors.csv", sep=";")
df_acted_in = pd.read_csv("/content/knowledge_graph_using_tigergraph/data/acted_in.csv", sep=";")

print(df_movies.info())
print(df_actors.info())
print(df_acted_in.info())

Look for attributes with NaN/Null values

df_movies

null_df = df_movies[df_movies.isna().any(axis=1)]
print(null_df)

                     title tagline  released
33  Something's Gotta Give     NaN      2003

df_actors

null_df = df_actors[df_actors.isna().any(axis=1)]
print(null_df)

                 name  born
103     Naomie Harris   NaN
129       Paul Blythe   NaN
130      Angela Scope   NaN
131  Jessica Thompson   NaN
132    James Thompson   NaN

Replace all NaN attributes with ‘’ empty strings or 0 values depending on their data type. Otherwise we will get an error: TigerGraphException: (‘Null JSON value is not supported.’, ‘REST-30200’)

df_movies = df_movies.fillna('') #tagline (str) has NaN
df_actors = df_actors.fillna(0) #born (int) has NaN

Create host string using subdomain

load_dotenv('/content/env', override=True)

TG_IP = os.getenv('TG_IP')
TG_CLUSTER = os.getenv('TG_CLUSTER')
TG_SUBDOMAIN = os.getenv('TG_SUBDOMAIN')
TG_DOMAIN=f"{TG_SUBDOMAIN}.i.tgcloud.io"
TG_SUBNET = os.getenv('TG_SUBNET')
TG_USERNAME = os.getenv('TG_USERNAME')
TG_PASSWORD = os.getenv('TG_PASSWORD')
TG_GRAPH = os.getenv('TG_GRAPH')
host_string = f"https://{TG_DOMAIN}"

Setup connection

conn = tg.TigerGraphConnection(host=host_string, username=TG_USERNAME, password=TG_PASSWORD)
print("Connected")
print(conn)

CREATING GRAPH SCHEMA USING pyTigerGraph

At this point we will drop the whole schema (previously created using GSQLShell) and create everything afresh using pyTigerGraph

#Dangerous!
conn.gsql('''
USE GLOBAL
DROP ALL
''')

Create vertices and edges (again!)

conn.gsql('''
USE GLOBAL
CREATE VERTEX person (
  PRIMARY_ID name STRING, born INT
)
WITH primary_id_as_attribute="true"

CREATE VERTEX movie (
  PRIMARY_ID title STRING, tagline STRING, released INT
)
WITH primary_id_as_attribute="true"

CREATE UNDIRECTED EDGE ACTED_IN (FROM person, TO movie)
CREATE UNDIRECTED EDGE DIRECTED (FROM person, TO movie)
CREATE UNDIRECTED EDGE FRIEND (FROM person, TO person)
''')

Create Graph

conn.gsql('''
CREATE GRAPH movie_graph(person, movie, ACTED_IN, DIRECTED, FRIEND)
''')

Fetch access token

conn.graphname = TG_GRAPH
secret = conn.createSecret()
token = conn.getToken(secret, setToken=True)

print(conn, conn.graphname)

Verify the schema using conn object

print(conn.getSchema())
print(conn.gsql("LS"))
print(conn.getVertexTypes())
print(conn.getEdgeTypes())

UPSERTING DATA TO KNOWLEDGE GRAPH USING pyTigerGraph

Upsert (insert + update) themovie vertex

v_movies = conn.upsertVertexDataFrame(df_movies, 'movie', v_id='title', attributes={'title':'title', 'tagline': 'tagline', 'released': 'released'})
print(str(v_movies) + " movie VERTICES upserted")

Upsert the person vertex

v_actors = conn.upsertVertexDataFrame(df_actors, 'person', v_id='name', attributes={'name':'name', 'born': 'born'})
print(str(v_actors) + " person VERTICES upserted")

Upsert ACTED_IN edge

e_ACTED_IN = conn.upsertEdgeDataFrame(df=df_acted_in, sourceVertexType='person', edgeType="ACTED_IN", targetVertexType='movie', from_id='actor', to_id='movie', attributes={}
)
print(str(e_ACTED_IN) + " ACTED_IN EDGES upserted")

QUERYING KNOWLEDGE GRAPHS USING `pyTigerGraph`

A simple 1-hop parameterized query (without accumulators):

The query starts by seeding a vertex set start with the person vertex identified by parameter p passed in from the query call.

Next, the SELECT statement describes a 1-hop traversal according to the pattern described in the FROM clause:

start:s -(ACTED_IN:e)- movie:tgt

# simple select query
query = '''
    USE GRAPH movie_graph
    DROP QUERY select_all_movies_for_person
    CREATE QUERY select_all_movies_for_person (VERTEX<person> p) {
        start = {p};
        result = SELECT tgt
            FROM start:s -(ACTED_IN:e)- movie:tgt;
        PRINT result[result.title];
    }
INSTALL QUERY select_all_movies_for_person
'''

results = conn.gsql(query)
print(results)

parameters = {
    "p": "Tom Hanks"
}
results = conn.runInstalledQuery("select_all_movies_for_person", params=parameters)
print(json.dumps(results, indent=2))

2-hop parameterized query with accumulators:
Accumulators serve as the runtime attributes (properties) attachable to each vertex visited during our traversal on the graph.

ACCUM executes its statement(s) once for each matched edge (or path) of the FROM clause pattern. Further, ACCUM parallelly executes its statements for all the matches.

POST-ACCUM executes its statement(s) once for each involved vertex. Note that each statement within the POST-ACCUM clause can refer to either source vertices or target vertices but not both.

In this example we will explore different accumulator variables (ListAccum, SumAccum, AvgAccum, OrAccum), data types (INT, STRING, BOOL), scopes (@@&@: @@ means the scope is global; @ means the scope is local/vertex-attached.)

The list of movies that Tom Hanks has worked in:
ListAccum<STRING> @movies;
The count of movies that Tom Hanks has worked in:
SumAccum <INT> @@count_movies;
The list costars that he has worked in:
ListAccum<STRING> @costars;
The count of costars that he has worked in:
SumAccum <INT> @@count_costars;
Their average age:
AvgAccum @@avg_age;

query = '''
USE GRAPH movie_graph

DROP QUERY movies_of_costars

CREATE QUERY movies_of_costars (VERTEX<person> p, INT current_year = 2024) {
    OrAccum  @visited = FALSE;
    SumAccum <INT> @@count_movies;
    SumAccum <INT> @@count_costars;
    AvgAccum @@avg_age;
    ListAccum<STRING> @movies;
    ListAccum<STRING> @costars;
    
    start = {p};

    first_neighbors = SELECT tgt_m
        FROM start:s -(ACTED_IN:e)- movie:tgt_m
        ACCUM tgt_m.@visited += TRUE, s.@visited += TRUE, tgt_m.@movies += tgt_m.title
        POST-ACCUM (tgt_m) @@count_movies += 1;

    second_neighbors = SELECT tgt_p 
        FROM person:tgt_p -(:e)- first_neighbors
        WHERE tgt_p.@visited == FALSE 
        POST-ACCUM (tgt_p) @@count_costars += 1, @@avg_age += (current_year - tgt_p.born), tgt_p.@costars += tgt_p.name;

    PRINT "titles";
    PRINT first_neighbors[first_neighbors.title];
    PRINT "titles using @movies accumulator";
    PRINT first_neighbors[first_neighbors.@movies];
    PRINT "costars";
    PRINT second_neighbors[second_neighbors.name];
    PRINT "costars using @costars accumulator";
    PRINT second_neighbors[second_neighbors.@costars];
    PRINT "count of movies";
    PRINT @@count_movies;
    PRINT "count of costars";
    PRINT @@count_costars;
    PRINT "average age of costars";
    PRINT @@avg_age;
}

INSTALL QUERY movies_of_costars
'''

results = conn.gsql(query)
print(results)

parameters = {
    "p": "Tom Hanks"
}
results = conn.runInstalledQuery("movies_of_costars", params=parameters)
print(json.dumps(results, indent=2))

Acknowledgements & References