In [1]:
import networkx as nx
import pandas as pd
import numpy as np
from bokeh.io import output_notebook, export_svgs, show

import sys
sys.path.insert(0, "../")

from pubnet.graph.nx import lsm_to_nx
In [2]:
output_notebook()
Loading BokehJS ...
In [3]:
lsm_prefix = "../res/build_1/global_net_1904"
In [4]:
G = lsm_to_nx(lsm_prefix)
pubnet.graph.nx      INFO    @ 05/09/19 15:01:36: Counting pairs
pubnet.graph.nx      INFO    @ 05/09/19 15:02:48: 32425316 pairs found.
pubnet.graph.nx      INFO    @ 05/09/19 15:02:48: Load network into memory.
100%|██████████| 32425316/32425316 [33:55<00:00, 15931.65it/s] 

Number of nodes and edges:

In [5]:
len(G.nodes)
Out[5]:
3141000
In [6]:
len(G.edges)
Out[6]:
32425316

Different type nodes

In [7]:
from collections import Counter
In [8]:
c_nodes = Counter([G.nodes[n]['type'] for n in G.nodes])
In [9]:
c_nodes
Out[9]:
Counter({'Chemical': 2700653,
         'DNAMutation': 64116,
         'SNP': 43009,
         'ProteinMutation': 102420,
         'Disease': 11119,
         'Species': 11107,
         'Gene': 208576})
In [10]:
from math import pi

from bokeh.palettes import Pastel1
from bokeh.plotting import figure
from bokeh.transform import cumsum

x = c_nodes
data = pd.Series(x).reset_index(name='value').rename(columns={'index':'type'})
data['angle'] = data['value']/data['value'].sum() * 2*pi
data['color'] = Pastel1[len(x)]

p = figure(plot_height=350, title="Pie Chart", toolbar_location=None,
           tools="hover", tooltips="@type: @value", x_range=(-0.5, 1.0))

p.wedge(x=0, y=1, radius=0.4,
        start_angle=cumsum('angle', include_zero=True), end_angle=cumsum('angle'),
        line_color="white", fill_color='color', legend='type', source=data)

p.axis.axis_label=None
p.axis.visible=False
p.grid.grid_line_color = None

p.output_backend = "svg"
export_svgs(p, filename="node_types_pie.svg")

show(p)

Different type edges

In [11]:
c_edges = Counter([(G.nodes[e[0]]['type'], G.nodes[e[1]]['type']) for e in G.edges])
In [12]:
# merge same items
c_edges_ = {}
for k, v in c_edges.items():
    k = tuple(sorted(k))
    c_edges_.setdefault(k, 0)
    c_edges_[k] += v
In [13]:
c_edges_
Out[13]:
{('Chemical', 'Chemical'): 14506073,
 ('Chemical', 'DNAMutation'): 79409,
 ('Chemical', 'SNP'): 29004,
 ('Chemical', 'Gene'): 4073981,
 ('Chemical', 'Species'): 1980305,
 ('Chemical', 'Disease'): 3386139,
 ('Chemical', 'ProteinMutation'): 257445,
 ('DNAMutation', 'Gene'): 102469,
 ('DNAMutation', 'Species'): 33984,
 ('DNAMutation', 'Disease'): 70426,
 ('DNAMutation', 'SNP'): 32999,
 ('DNAMutation', 'DNAMutation'): 99822,
 ('DNAMutation', 'ProteinMutation'): 73021,
 ('Gene', 'SNP'): 107647,
 ('Disease', 'SNP'): 69901,
 ('ProteinMutation', 'Species'): 63857,
 ('Gene', 'ProteinMutation'): 136806,
 ('Disease', 'ProteinMutation'): 80341,
 ('ProteinMutation', 'ProteinMutation'): 199314,
 ('Disease', 'Gene'): 1893276,
 ('Disease', 'Species'): 382825,
 ('Disease', 'Disease'): 1488961,
 ('ProteinMutation', 'SNP'): 8748,
 ('SNP', 'SNP'): 113506,
 ('SNP', 'Species'): 20746,
 ('Gene', 'Species'): 596880,
 ('Species', 'Species'): 159924,
 ('Gene', 'Gene'): 2377507}
In [14]:
node_types = sorted(list(c_nodes.keys()))
m_ = np.zeros((len(node_types), len(node_types)))
edge_counts = pd.DataFrame(m_)
edge_counts.index = node_types
edge_counts.columns = node_types
for k, v in c_edges_.items():
    edge_counts.loc[k[0], k[1]] = v
edge_counts.index.name = 'type1'
edge_counts.columns.name = 'type2'
In [15]:
from bokeh.models import LogColorMapper, BasicTicker, PrintfTickFormatter, ColorBar
from bokeh.plotting import figure
from bokeh.sampledata.unemployment1948 import data

data = edge_counts

node_types = list(data.index)

# reshape to 1D array or rates with a month and year for each row.
df = pd.DataFrame(data.stack(), columns=['count']).reset_index()
df = df[df!=0].dropna()

# this is the colormap from the original NYTimes plot
colors = ["#75968f", "#a5bab7", "#c9d9d3", "#e2e2e2", "#dfccce", "#ddb7b1", "#cc7878", "#933b41", "#550b1d"]
mapper = LogColorMapper(palette=colors, low=df['count'].min(), high=df['count'].max())

TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"

p = figure(title="Number of edges",
           x_range=node_types, y_range=node_types,
           x_axis_location="above", plot_width=700, plot_height=600,
           tools=TOOLS, toolbar_location='below',
           tooltips=[('type', '@type1 - @type2'), ('count', '@count')])

p.grid.grid_line_color = None
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
#p.axis.major_label_text_font_size = "5pt"
p.axis.major_label_standoff = 0
p.xaxis.major_label_orientation = pi / 3

p.rect(x="type1", y="type2", width=1, height=1,
       source=df,
       fill_color={'field': 'count', 'transform': mapper},
       line_color=None)

color_bar = ColorBar(color_mapper=mapper,
                     ticker=BasicTicker(desired_num_ticks=len(colors)),
                     formatter=PrintfTickFormatter(format="%d"),
                     label_standoff=15, border_line_color=None, location=(0, 0))
p.add_layout(color_bar, 'right')

p.output_backend = "svg"
export_svgs(p, filename="edge_types_heatmap.svg")


show(p)      # show the plot

Node degree distribution

In [16]:
nodes_degree = sorted([d for n, d in G.degree()], reverse=True)
In [17]:
hist, bin_edges = np.histogram(nodes_degree, bins=200)
TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"

p = figure(title="Node degree distribution",
           tools=TOOLS,
           background_fill_color="#fafafa",
           plot_width=600, plot_height=300,
           y_axis_type="log",
           x_axis_type="log")

p.circle(hist, bin_edges[:-1], alpha=0.5, fill_color="navy", size=14)

p.y_range.start = 0

p.xaxis.axis_label = 'Degree'
p.yaxis.axis_label = 'Count'
p.grid.grid_line_color="white"

p.output_backend = "svg"
export_svgs(p, filename="node_degree_dist.svg")

show(p)

Edge strength and co-occurence distribution

In [18]:
edge_strength = []
edge_coo = []

for e in G.edges:
    e_ = G.edges[e]
    edge_strength.append(e_['strength'])
    edge_coo.append(e_['coo'])
In [19]:
hist, bin_edges = np.histogram(edge_strength, bins=200)
TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"

p = figure(title="Edge strength distribution",
           tools=TOOLS,
           background_fill_color="#fafafa",
           plot_width=600, plot_height=300,
           y_axis_type="log",
           x_axis_type="log")

p.circle(hist, bin_edges[:-1], alpha=0.5, fill_color="orange", size=14)

p.y_range.start = 0

p.xaxis.axis_label = 'Strength'
p.yaxis.axis_label = 'Count'
p.grid.grid_line_color="white"

p.output_backend = "svg"
export_svgs(p, filename="edge_strength_dist.svg")

show(p)
In [20]:
hist, bin_edges = np.histogram(edge_coo, bins=200)
TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"

p = figure(title="Edge co-occurence distribution",
           tools=TOOLS,
           background_fill_color="#fafafa",
           plot_width=600, plot_height=300,
           y_axis_type="log",
           x_axis_type="log")

p.circle(hist, bin_edges[:-1], alpha=0.5, fill_color="blue", size=14)

p.y_range.start = 0

p.xaxis.axis_label = 'Co-occurence'
p.yaxis.axis_label = 'Count'
p.grid.grid_line_color="white"

p.output_backend = "svg"
export_svgs(p, filename="edge_coo_dist.svg")

show(p)

Bio-concept Set Enrichment Analysis

Example data: SAGD_00055

In [21]:
from pubnet.graph.bsea import BSEA, EntrezEnsemblConvert
In [22]:
input_csv = "../data/SAGD_00055.csv"
exp = pd.read_csv(input_csv)
exp.columns = ["ensembl_id"] + list(exp.columns)[1:]
In [23]:
sig = exp[exp.padj <= 0.05]
test_genes = list(sig.ensembl_id)
len(test_genes)
Out[23]:
54
In [24]:
cvt = EntrezEnsemblConvert()
test_entrez = cvt.ensembl2entrez(test_genes)
pubnet.graph.bsea    INFO    @ 05/09/19 15:39:50: Perform gene ID conversion: from ensembl id to entrez id.
querying 1-54...done.
pubnet.graph.bsea    WARNING @ 05/09/19 15:40:01: 29 gene conversion failed, failed genes:
pubnet.graph.bsea    WARNING @ 05/09/19 15:40:01: ENSG00000131002,ENSG00000229807,ENSG00000233864,ENSG00000099725,ENSG00000176728,ENSG00000260197,ENSG00000206159,ENSG00000241859,ENSG00000278847,ENSG00000273906,ENSG00000227289,ENSG00000228764,ENSG00000232226,ENSG00000229236,ENSG00000225117,ENSG00000270641,ENSG00000267793,ENSG00000232348,ENSG00000224060,ENSG00000229163,ENSG00000215580,ENSG00000231535,ENSG00000227494,ENSG00000233070,ENSG00000228787,ENSG00000229238,ENSG00000130600,ENSG00000217896,ENSG00000261600
Finished.
2 input query terms found no hit:
	['ENSG00000233864', 'ENSG00000232348']
Pass "returnall=True" to return complete lists of duplicate or missing query terms.
In [25]:
bsea = BSEA(G, input_types=['Gene'], target_types='-')
In [26]:
genes = [str(i) for i in test_entrez]
In [27]:
bsea.enrich(genes)
pubnet.graph.bsea    INFO    @ 05/09/19 15:40:05: Perform gene set enrichment analyze.
pubnet.graph.bsea    INFO    @ 05/09/19 15:40:05: input conceptions: 25	valid conceptions: 23	possible terms: 2922
100%|██████████| 2922/2922 [00:18<00:00, 154.52it/s]
pubnet.graph.bsea    INFO    @ 05/09/19 15:40:24: Done
In [28]:
bsea.multiple_test_corretion()
pubnet.graph.bsea    INFO    @ 05/09/19 15:40:24: Multiple test correction with fdr_bh method.
pubnet.graph.bsea    INFO    @ 05/09/19 15:40:25: Done
In [47]:
bsea.filter(by='padj', threshold=0.001)
pubnet.graph.bsea    INFO    @ 05/09/19 15:59:20: filter the enrichment result with condition: padj <= 0.001
In [48]:
df = bsea.enrichment_table
df.shape[0]
Out[48]:
383
In [50]:
df.groupby('term_type').count()['term_ID']
Out[50]:
term_type
Chemical           161
DNAMutation         24
Disease            116
ProteinMutation     36
SNP                 15
Species             31
Name: term_ID, dtype: int64
In [56]:
df[df.term_type != 'Species'].head(10)
Out[56]:
term_ID term_type url study_count n_study population_count n_population gene_ratio background_ratio odd_ratio pvalue padj related_genes
1565 MESH:D011471 Disease https://meshb.nlm.nih.gov/record/ui?ui=D011471 17 23 7000 208576 0.739130 0.033561 22.023553 7.102240e-21 5.188186e-18 8653 8287 22829 7404 7544 83259 9087 90665 426...
1469 MESH:D009369 Disease https://meshb.nlm.nih.gov/record/ui?ui=D009369 20 23 32654 208576 0.869565 0.156557 5.554310 8.499019e-14 4.966827e-11 8653 8287 9086 7404 7544 83259 9087 90665 4267...
1564 MESH:D011470 Disease https://meshb.nlm.nih.gov/record/ui?ui=D011470 8 23 1473 208576 0.347826 0.007062 49.251985 2.710133e-12 1.131287e-09 8653 8287 7404 7544 9087 1116 6736 7543
1383 MESH:D007938 Disease https://meshb.nlm.nih.gov/record/ui?ui=D007938 11 23 6664 208576 0.478261 0.031950 14.969048 3.328331e-11 1.215673e-08 8653 7404 4267 1116 6736 7543 9189 1654 7403 8...
311 CHEBI:33704 Chemical https://www.ebi.ac.uk/chebi/searchId.do?chebiI... 16 23 25109 208576 0.695652 0.120383 5.778659 2.050363e-10 5.446509e-08 8653 9086 7404 7544 83259 4267 1116 246777 673...
1013 MESH:D001943 Disease https://meshb.nlm.nih.gov/record/ui?ui=D001943 12 23 11341 208576 0.521739 0.054373 9.595473 5.103941e-10 1.147209e-07 8287 9087 90665 4267 1116 6736 7543 1117 1654 ...
1460 MESH:D009223 Disease https://meshb.nlm.nih.gov/record/ui?ui=D009223 7 23 1802 208576 0.304348 0.008640 35.227332 7.715694e-10 1.610376e-07 8653 8287 7404 7544 9087 1116 6736
992 MESH:D001523 Disease https://meshb.nlm.nih.gov/record/ui?ui=D001523 10 23 7310 208576 0.434783 0.035047 12.405638 2.089974e-09 4.071269e-07 22829 7404 83259 4267 1116 6736 7543 1654 8242...
1374 MESH:D007713 Disease https://meshb.nlm.nih.gov/record/ui?ui=D007713 4 23 152 208576 0.173913 0.000729 238.645309 2.374418e-09 4.336280e-07 7544 4267 6736 7543
941 MESH:D000596 Chemical https://meshb.nlm.nih.gov/record/ui?ui=D000596 10 23 7876 208576 0.434783 0.037761 11.514121 4.264456e-09 7.329847e-07 8653 8287 9087 4267 1116 6736 1117 1654 8242 6373
In [52]:
df[df.term_type == 'Disease'].head(5)
Out[52]:
term_ID term_type url study_count n_study population_count n_population gene_ratio background_ratio odd_ratio pvalue padj related_genes
1565 MESH:D011471 Disease https://meshb.nlm.nih.gov/record/ui?ui=D011471 17 23 7000 208576 0.739130 0.033561 22.023553 7.102240e-21 5.188186e-18 8653 8287 22829 7404 7544 83259 9087 90665 426...
1469 MESH:D009369 Disease https://meshb.nlm.nih.gov/record/ui?ui=D009369 20 23 32654 208576 0.869565 0.156557 5.554310 8.499019e-14 4.966827e-11 8653 8287 9086 7404 7544 83259 9087 90665 4267...
1564 MESH:D011470 Disease https://meshb.nlm.nih.gov/record/ui?ui=D011470 8 23 1473 208576 0.347826 0.007062 49.251985 2.710133e-12 1.131287e-09 8653 8287 7404 7544 9087 1116 6736 7543
1383 MESH:D007938 Disease https://meshb.nlm.nih.gov/record/ui?ui=D007938 11 23 6664 208576 0.478261 0.031950 14.969048 3.328331e-11 1.215673e-08 8653 7404 4267 1116 6736 7543 9189 1654 7403 8...
1013 MESH:D001943 Disease https://meshb.nlm.nih.gov/record/ui?ui=D001943 12 23 11341 208576 0.521739 0.054373 9.595473 5.103941e-10 1.147209e-07 8287 9087 90665 4267 1116 6736 7543 1117 1654 ...
In [53]:
df[df.term_type == 'SNP'].head(5)
Out[53]:
term_ID term_type url study_count n_study population_count n_population gene_ratio background_ratio odd_ratio pvalue padj related_genes
2901 rs3212292 SNP https://www.ncbi.nlm.nih.gov/snp/rs3212292 2 23 2 208576 0.086957 0.000010 9068.521739 1.163118e-08 0.000001 8287 90665
2914 rs768983 SNP https://www.ncbi.nlm.nih.gov/snp/rs768983 2 23 2 208576 0.086957 0.000010 9068.521739 1.163118e-08 0.000001 8287 90665
2921 rs9341273 SNP https://www.ncbi.nlm.nih.gov/snp/rs9341273 2 23 2 208576 0.086957 0.000010 9068.521739 1.163118e-08 0.000001 8287 90665
2880 rs10399931 SNP https://www.ncbi.nlm.nih.gov/snp/rs10399931 1 23 1 208576 0.043478 0.000005 9068.521739 1.102716e-04 0.000848 1116
2910 rs6817952 SNP https://www.ncbi.nlm.nih.gov/snp/rs6817952 1 23 1 208576 0.043478 0.000005 9068.521739 1.102716e-04 0.000848 6373
In [54]:
df[df.term_type == 'Chemical'].head(5)
Out[54]:
term_ID term_type url study_count n_study population_count n_population gene_ratio background_ratio odd_ratio pvalue padj related_genes
311 CHEBI:33704 Chemical https://www.ebi.ac.uk/chebi/searchId.do?chebiI... 16 23 25109 208576 0.695652 0.120383 5.778659 2.050363e-10 5.446509e-08 8653 9086 7404 7544 83259 4267 1116 246777 673...
941 MESH:D000596 Chemical https://meshb.nlm.nih.gov/record/ui?ui=D000596 10 23 7876 208576 0.434783 0.037761 11.514121 4.264456e-09 7.329847e-07 8653 8287 9087 4267 1116 6736 1117 1654 8242 6373
2654 [missing]oryzomyne Chemical 2 23 2 208576 0.086957 0.000010 9068.521739 1.163118e-08 1.369294e-06 7544 7543
2658 [missing]p12----p11 Chemical 2 23 2 208576 0.086957 0.000010 9068.521739 1.163118e-08 1.369294e-06 7544 7543
347 CHEBI:50113 Chemical https://www.ebi.ac.uk/chebi/searchId.do?chebiI... 8 23 4699 208576 0.347826 0.022529 15.439067 2.390840e-08 2.253559e-06 8287 7544 83259 90665 4267 1116 6736 6373

ABC Model

In [35]:
ABC = ('Disease', 'Gene', 'Chemical')
ab_links = []
bc_links = []

for e in G.edges():
    types = (G.nodes[e[0]]['type'], G.nodes[e[1]]['type'])
    if types[0] == ABC[0] and types[1] == ABC[1]:
        ab_links.append(e)
    elif types[0] == ABC[1] and types[1] == ABC[2]:
        bc_links.append(e)
In [36]:
df_ab = pd.DataFrame(ab_links, columns=['a', 'b'])
df_bc = pd.DataFrame(bc_links, columns=['b', 'c'])
df_abc = pd.merge(df_ab, df_bc, on=['b'])
In [37]:
df_abc.shape[0]
Out[37]:
267077144
In [38]:
known_ac = set(e[0]+","+e[1] for e in G.edges)
df_abc['ac'] = df_abc.a + "," + df_abc.c
df_abc['ca'] = df_abc.c + "," + df_abc.a
df_abc['known'] = df_abc.ac.isin(known_ac) | df_abc.ca.isin(known_ac)
print(df_abc[~df_abc['known']].shape[0])
232834737
In [39]:
df_abc.head(10)
Out[39]:
a b c ac ca known
0 [missing]VHA 2838 CHEBI:41132 [missing]VHA,CHEBI:41132 CHEBI:41132,[missing]VHA False
1 [missing]VHA 2838 CHEBI:51707 [missing]VHA,CHEBI:51707 CHEBI:51707,[missing]VHA False
2 [missing]VHA 2838 CHEBI:60715 [missing]VHA,CHEBI:60715 CHEBI:60715,[missing]VHA False
3 [missing]VHA 2838 MESH:C007547 [missing]VHA,MESH:C007547 MESH:C007547,[missing]VHA False
4 [missing]VHA 2838 MESH:C025059 [missing]VHA,MESH:C025059 MESH:C025059,[missing]VHA False
5 [missing]VHA 2838 MESH:C042168 [missing]VHA,MESH:C042168 MESH:C042168,[missing]VHA False
6 [missing]VHA 2838 MESH:C102815 [missing]VHA,MESH:C102815 MESH:C102815,[missing]VHA False
7 [missing]VHA 2838 MESH:C467767 [missing]VHA,MESH:C467767 MESH:C467767,[missing]VHA False
8 [missing]VHA 2838 MESH:C542279 [missing]VHA,MESH:C542279 MESH:C542279,[missing]VHA False
9 [missing]VHA 2838 [missing](PMe2Ph)2PtB10H10-O,H-B10H11Pt(PMe2Ph [missing]VHA,[missing](PMe2Ph)2PtB10H10-O,H-B1... [missing](PMe2Ph)2PtB10H10-O,H-B10H11Pt(PMe2Ph... False