In [1]:
import os
import time
os.chdir(os.path.dirname(INSERT_PATH_HERE))

In [3]:
import numpy as np
import pandas as pd

from pyrdf2vec import RDF2VecTransformer
from pyrdf2vec.embedders import Word2Vec
from pyrdf2vec.graphs import KG
from pyrdf2vec.walkers import RandomWalker
from pyrdf2vec.samplers import PageRankSampler

import matplotlib.pyplot as plt

In [4]:
st = time.process_time()
# Load KG
kg = KG('artists_graph.nt')

# extract all entities
# get all papers from the graph
from rdflib import Graph, RDF, URIRef
g = Graph()
g.parse('artists_graph.nt')
entities = set()
for e in g.subjects():
    s = str(e)
    if s.startswith("http"):
        entities.add(s)
for e in g.objects():
    s = str(e)
    if s.startswith("http"):
        entities.add(s)

lstentities = []
for e in entities:
    lstentities.append(e)

print(str(len(lstentities)) + " entities found")

1786 entities found


In [5]:
# We specify the depth and maximum number of walks per entity
random_walker = RandomWalker(4, 500)
walkers = []
for i in range(1):
    walkers.append(random_walker)

In [6]:
transformer = RDF2VecTransformer(walkers=walkers, embedder=Word2Vec(sg=1, vector_size=200, hs=1, window=5, min_count=0))
st = time.process_time()
embeddings,_ = transformer.fit_transform(kg, lstentities)
et = time.process_time()
duration = str(et-st)
print("embeddings computed in " + duration + " seconds")

embeddings computed in 19.1875 seconds


In [7]:
print(len(embeddings))

1786


In [8]:
# compute average relation vector for http://dbpedia.org/ontology/bandMember
relations = np.empty([0,200])
pred = URIRef("http://dbpedia.org/ontology/bandMember")
for s,p,o in g.triples((None,pred,None)):
    sVector = embeddings[lstentities.index(str(s))]
    oVector = embeddings[lstentities.index(str(o))]
    rVector = oVector - sVector
    relations = np.vstack([relations,rVector])
    
relationVector = relations.mean(axis=0)

In [11]:
entity = "http://dbpedia.org/resource/Soulive"
sVector = embeddings[lstentities.index(entity)]
oVectorPrediction = sVector + relationVector
print(oVectorPrediction)

[-0.10245946 -0.28592306  0.40919762  0.26752798  0.64695854 -0.04609001
 -0.4891275   0.41482562 -0.2439612  -0.36260311 -0.49105147 -0.08911946
  0.04959735 -0.29058747 -0.02247287  0.03881578  0.00350746  0.095191
 -0.53222076 -0.42890648  0.15434494 -0.30577704  0.04553999  0.00230763
 -0.17380755  0.04767534  0.07255753  0.38884306  0.21884037  0.53863733
  0.2128529  -0.30911234  0.21667001 -0.08320978 -0.33849345  0.53412074
 -0.09821881 -0.07251939 -0.00840029 -0.13569858  0.14565726  0.43100204
 -0.48904073  0.13740144 -0.20214401  0.17101105  0.28236279 -0.43846874
 -0.36807893  0.21294125  0.16022821 -0.30039877 -0.17659824  0.58144793
 -0.0989891   0.73272628 -0.29497498 -0.34761093  0.39642645  0.39841372
  0.14220494  0.5383226   0.16997071  0.35949316 -0.20391229 -0.00343997
 -0.41428877 -0.44010644  0.42339169 -0.33063893  0.13081799  0.28878486
  0.14159549  0.14311199  0.18231186 -0.0593251   0.00636987  0.14918074
 -0.05519574  0.10147541 -0.09620431  0.3965452   0.2

In [12]:
# find candidates
from sklearn.neighbors import NearestNeighbors

dfembeddings = pd.DataFrame.from_records(embeddings)
knn = NearestNeighbors(n_neighbors=10, algorithm='auto', metric='cosine')
knn.fit(dfembeddings)

dfVectorPrediction = pd.DataFrame(oVectorPrediction).transpose()

for index in knn.kneighbors(dfVectorPrediction, 10, return_distance = False)[0]:
    print(lstentities[index])

http://dbpedia.org/resource/Soulive
http://dbpedia.org/resource/Joshua_Redman
http://dbpedia.org/resource/Rashawn_Ross
http://dbpedia.org/resource/Lettuce_(band)
http://dbpedia.org/resource/Funk_Trek
http://dbpedia.org/resource/Rustic_Overtones
http://dbpedia.org/resource/Fred_Wesley__Fred_Wesley__1
http://dbpedia.org/resource/Paranoid_Social_Club
http://dbpedia.org/resource/Kaotiko
http://dbpedia.org/resource/J._Blackfoot__J._Blackfoot__1
