import networkx as nx
import pandas as pd
import numpy as np
from bokeh.io import output_notebook, export_svgs, show

import sys
sys.path.insert(0, "../")

from pubnet.graph.nx import lsm_to_nx

output_notebook()

lsm_prefix = "../res/build_1/global_net_1904"

G = lsm_to_nx(lsm_prefix)

pubnet.graph.nx      INFO    @ 05/09/19 15:01:36: Counting pairs
pubnet.graph.nx      INFO    @ 05/09/19 15:02:48: 32425316 pairs found.
pubnet.graph.nx      INFO    @ 05/09/19 15:02:48: Load network into memory.
100%|██████████| 32425316/32425316 [33:55<00:00, 15931.65it/s]

Number of nodes and edges:

len(G.nodes)

3141000

len(G.edges)

32425316

Different type nodes

from collections import Counter

c_nodes = Counter([G.nodes[n]['type'] for n in G.nodes])

c_nodes

Counter({'Chemical': 2700653,
         'DNAMutation': 64116,
         'SNP': 43009,
         'ProteinMutation': 102420,
         'Disease': 11119,
         'Species': 11107,
         'Gene': 208576})

from math import pi

from bokeh.palettes import Pastel1
from bokeh.plotting import figure
from bokeh.transform import cumsum

x = c_nodes
data = pd.Series(x).reset_index(name='value').rename(columns={'index':'type'})
data['angle'] = data['value']/data['value'].sum() * 2*pi
data['color'] = Pastel1[len(x)]

p = figure(plot_height=350, title="Pie Chart", toolbar_location=None,
           tools="hover", tooltips="@type: @value", x_range=(-0.5, 1.0))

p.wedge(x=0, y=1, radius=0.4,
        start_angle=cumsum('angle', include_zero=True), end_angle=cumsum('angle'),
        line_color="white", fill_color='color', legend='type', source=data)

p.axis.axis_label=None
p.axis.visible=False
p.grid.grid_line_color = None

p.output_backend = "svg"
export_svgs(p, filename="node_types_pie.svg")

show(p)

Different type edges

c_edges = Counter([(G.nodes[e[0]]['type'], G.nodes[e[1]]['type']) for e in G.edges])

# merge same items
c_edges_ = {}
for k, v in c_edges.items():
    k = tuple(sorted(k))
    c_edges_.setdefault(k, 0)
    c_edges_[k] += v

c_edges_

{('Chemical', 'Chemical'): 14506073,
 ('Chemical', 'DNAMutation'): 79409,
 ('Chemical', 'SNP'): 29004,
 ('Chemical', 'Gene'): 4073981,
 ('Chemical', 'Species'): 1980305,
 ('Chemical', 'Disease'): 3386139,
 ('Chemical', 'ProteinMutation'): 257445,
 ('DNAMutation', 'Gene'): 102469,
 ('DNAMutation', 'Species'): 33984,
 ('DNAMutation', 'Disease'): 70426,
 ('DNAMutation', 'SNP'): 32999,
 ('DNAMutation', 'DNAMutation'): 99822,
 ('DNAMutation', 'ProteinMutation'): 73021,
 ('Gene', 'SNP'): 107647,
 ('Disease', 'SNP'): 69901,
 ('ProteinMutation', 'Species'): 63857,
 ('Gene', 'ProteinMutation'): 136806,
 ('Disease', 'ProteinMutation'): 80341,
 ('ProteinMutation', 'ProteinMutation'): 199314,
 ('Disease', 'Gene'): 1893276,
 ('Disease', 'Species'): 382825,
 ('Disease', 'Disease'): 1488961,
 ('ProteinMutation', 'SNP'): 8748,
 ('SNP', 'SNP'): 113506,
 ('SNP', 'Species'): 20746,
 ('Gene', 'Species'): 596880,
 ('Species', 'Species'): 159924,
 ('Gene', 'Gene'): 2377507}

node_types = sorted(list(c_nodes.keys()))
m_ = np.zeros((len(node_types), len(node_types)))
edge_counts = pd.DataFrame(m_)
edge_counts.index = node_types
edge_counts.columns = node_types
for k, v in c_edges_.items():
    edge_counts.loc[k[0], k[1]] = v
edge_counts.index.name = 'type1'
edge_counts.columns.name = 'type2'

from bokeh.models import LogColorMapper, BasicTicker, PrintfTickFormatter, ColorBar
from bokeh.plotting import figure
from bokeh.sampledata.unemployment1948 import data

data = edge_counts

node_types = list(data.index)

# reshape to 1D array or rates with a month and year for each row.
df = pd.DataFrame(data.stack(), columns=['count']).reset_index()
df = df[df!=0].dropna()

# this is the colormap from the original NYTimes plot
colors = ["#75968f", "#a5bab7", "#c9d9d3", "#e2e2e2", "#dfccce", "#ddb7b1", "#cc7878", "#933b41", "#550b1d"]
mapper = LogColorMapper(palette=colors, low=df['count'].min(), high=df['count'].max())

TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"

p = figure(title="Number of edges",
           x_range=node_types, y_range=node_types,
           x_axis_location="above", plot_width=700, plot_height=600,
           tools=TOOLS, toolbar_location='below',
           tooltips=[('type', '@type1 - @type2'), ('count', '@count')])

p.grid.grid_line_color = None
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
#p.axis.major_label_text_font_size = "5pt"
p.axis.major_label_standoff = 0
p.xaxis.major_label_orientation = pi / 3

p.rect(x="type1", y="type2", width=1, height=1,
       source=df,
       fill_color={'field': 'count', 'transform': mapper},
       line_color=None)

color_bar = ColorBar(color_mapper=mapper,
                     ticker=BasicTicker(desired_num_ticks=len(colors)),
                     formatter=PrintfTickFormatter(format="%d"),
                     label_standoff=15, border_line_color=None, location=(0, 0))
p.add_layout(color_bar, 'right')

p.output_backend = "svg"
export_svgs(p, filename="edge_types_heatmap.svg")


show(p)      # show the plot

Node degree distribution

nodes_degree = sorted([d for n, d in G.degree()], reverse=True)

hist, bin_edges = np.histogram(nodes_degree, bins=200)
TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"

p = figure(title="Node degree distribution",
           tools=TOOLS,
           background_fill_color="#fafafa",
           plot_width=600, plot_height=300,
           y_axis_type="log",
           x_axis_type="log")

p.circle(hist, bin_edges[:-1], alpha=0.5, fill_color="navy", size=14)

p.y_range.start = 0

p.xaxis.axis_label = 'Degree'
p.yaxis.axis_label = 'Count'
p.grid.grid_line_color="white"

p.output_backend = "svg"
export_svgs(p, filename="node_degree_dist.svg")

show(p)

Edge strength and co-occurence distribution

edge_strength = []
edge_coo = []

for e in G.edges:
    e_ = G.edges[e]
    edge_strength.append(e_['strength'])
    edge_coo.append(e_['coo'])

hist, bin_edges = np.histogram(edge_strength, bins=200)
TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"

p = figure(title="Edge strength distribution",
           tools=TOOLS,
           background_fill_color="#fafafa",
           plot_width=600, plot_height=300,
           y_axis_type="log",
           x_axis_type="log")

p.circle(hist, bin_edges[:-1], alpha=0.5, fill_color="orange", size=14)

p.y_range.start = 0

p.xaxis.axis_label = 'Strength'
p.yaxis.axis_label = 'Count'
p.grid.grid_line_color="white"

p.output_backend = "svg"
export_svgs(p, filename="edge_strength_dist.svg")

show(p)

hist, bin_edges = np.histogram(edge_coo, bins=200)
TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"

p = figure(title="Edge co-occurence distribution",
           tools=TOOLS,
           background_fill_color="#fafafa",
           plot_width=600, plot_height=300,
           y_axis_type="log",
           x_axis_type="log")

p.circle(hist, bin_edges[:-1], alpha=0.5, fill_color="blue", size=14)

p.y_range.start = 0

p.xaxis.axis_label = 'Co-occurence'
p.yaxis.axis_label = 'Count'
p.grid.grid_line_color="white"

p.output_backend = "svg"
export_svgs(p, filename="edge_coo_dist.svg")

show(p)

Bio-concept Set Enrichment Analysis¶

Example data: SAGD_00055

from pubnet.graph.bsea import BSEA, EntrezEnsemblConvert

input_csv = "../data/SAGD_00055.csv"
exp = pd.read_csv(input_csv)
exp.columns = ["ensembl_id"] + list(exp.columns)[1:]

sig = exp[exp.padj <= 0.05]
test_genes = list(sig.ensembl_id)
len(test_genes)

54

cvt = EntrezEnsemblConvert()
test_entrez = cvt.ensembl2entrez(test_genes)

pubnet.graph.bsea    INFO    @ 05/09/19 15:39:50: Perform gene ID conversion: from ensembl id to entrez id.

querying 1-54...done.

pubnet.graph.bsea    WARNING @ 05/09/19 15:40:01: 29 gene conversion failed, failed genes:
pubnet.graph.bsea    WARNING @ 05/09/19 15:40:01: ENSG00000131002,ENSG00000229807,ENSG00000233864,ENSG00000099725,ENSG00000176728,ENSG00000260197,ENSG00000206159,ENSG00000241859,ENSG00000278847,ENSG00000273906,ENSG00000227289,ENSG00000228764,ENSG00000232226,ENSG00000229236,ENSG00000225117,ENSG00000270641,ENSG00000267793,ENSG00000232348,ENSG00000224060,ENSG00000229163,ENSG00000215580,ENSG00000231535,ENSG00000227494,ENSG00000233070,ENSG00000228787,ENSG00000229238,ENSG00000130600,ENSG00000217896,ENSG00000261600

Finished.
2 input query terms found no hit:
	['ENSG00000233864', 'ENSG00000232348']
Pass "returnall=True" to return complete lists of duplicate or missing query terms.

bsea = BSEA(G, input_types=['Gene'], target_types='-')

genes = [str(i) for i in test_entrez]

bsea.enrich(genes)

pubnet.graph.bsea    INFO    @ 05/09/19 15:40:05: Perform gene set enrichment analyze.
pubnet.graph.bsea    INFO    @ 05/09/19 15:40:05: input conceptions: 25	valid conceptions: 23	possible terms: 2922
100%|██████████| 2922/2922 [00:18<00:00, 154.52it/s]
pubnet.graph.bsea    INFO    @ 05/09/19 15:40:24: Done

bsea.multiple_test_corretion()

pubnet.graph.bsea    INFO    @ 05/09/19 15:40:24: Multiple test correction with fdr_bh method.
pubnet.graph.bsea    INFO    @ 05/09/19 15:40:25: Done

bsea.filter(by='padj', threshold=0.001)

pubnet.graph.bsea    INFO    @ 05/09/19 15:59:20: filter the enrichment result with condition: padj <= 0.001

df = bsea.enrichment_table
df.shape[0]

383

df.groupby('term_type').count()['term_ID']

term_type
Chemical           161
DNAMutation         24
Disease            116
ProteinMutation     36
SNP                 15
Species             31
Name: term_ID, dtype: int64

df[df.term_type != 'Species'].head(10)

df[df.term_type == 'Disease'].head(5)

df[df.term_type == 'SNP'].head(5)

df[df.term_type == 'Chemical'].head(5)

ABC Model¶

ABC = ('Disease', 'Gene', 'Chemical')
ab_links = []
bc_links = []

for e in G.edges():
    types = (G.nodes[e[0]]['type'], G.nodes[e[1]]['type'])
    if types[0] == ABC[0] and types[1] == ABC[1]:
        ab_links.append(e)
    elif types[0] == ABC[1] and types[1] == ABC[2]:
        bc_links.append(e)

df_ab = pd.DataFrame(ab_links, columns=['a', 'b'])
df_bc = pd.DataFrame(bc_links, columns=['b', 'c'])
df_abc = pd.merge(df_ab, df_bc, on=['b'])

df_abc.shape[0]

267077144

known_ac = set(e[0]+","+e[1] for e in G.edges)
df_abc['ac'] = df_abc.a + "," + df_abc.c
df_abc['ca'] = df_abc.c + "," + df_abc.a
df_abc['known'] = df_abc.ac.isin(known_ac) | df_abc.ca.isin(known_ac)
print(df_abc[~df_abc['known']].shape[0])

232834737

df_abc.head(10)

	term_ID	term_type	url	study_count	n_study	population_count	n_population	gene_ratio	background_ratio	odd_ratio	pvalue	padj	related_genes
1565	MESH:D011471	Disease	https://meshb.nlm.nih.gov/record/ui?ui=D011471	17	23	7000	208576	0.739130	0.033561	22.023553	7.102240e-21	5.188186e-18	8653 8287 22829 7404 7544 83259 9087 90665 426...
1469	MESH:D009369	Disease	https://meshb.nlm.nih.gov/record/ui?ui=D009369	20	23	32654	208576	0.869565	0.156557	5.554310	8.499019e-14	4.966827e-11	8653 8287 9086 7404 7544 83259 9087 90665 4267...
1564	MESH:D011470	Disease	https://meshb.nlm.nih.gov/record/ui?ui=D011470	8	23	1473	208576	0.347826	0.007062	49.251985	2.710133e-12	1.131287e-09	8653 8287 7404 7544 9087 1116 6736 7543
1383	MESH:D007938	Disease	https://meshb.nlm.nih.gov/record/ui?ui=D007938	11	23	6664	208576	0.478261	0.031950	14.969048	3.328331e-11	1.215673e-08	8653 7404 4267 1116 6736 7543 9189 1654 7403 8...
311	CHEBI:33704	Chemical	https://www.ebi.ac.uk/chebi/searchId.do?chebiI...	16	23	25109	208576	0.695652	0.120383	5.778659	2.050363e-10	5.446509e-08	8653 9086 7404 7544 83259 4267 1116 246777 673...
1013	MESH:D001943	Disease	https://meshb.nlm.nih.gov/record/ui?ui=D001943	12	23	11341	208576	0.521739	0.054373	9.595473	5.103941e-10	1.147209e-07	8287 9087 90665 4267 1116 6736 7543 1117 1654 ...
1460	MESH:D009223	Disease	https://meshb.nlm.nih.gov/record/ui?ui=D009223	7	23	1802	208576	0.304348	0.008640	35.227332	7.715694e-10	1.610376e-07	8653 8287 7404 7544 9087 1116 6736
992	MESH:D001523	Disease	https://meshb.nlm.nih.gov/record/ui?ui=D001523	10	23	7310	208576	0.434783	0.035047	12.405638	2.089974e-09	4.071269e-07	22829 7404 83259 4267 1116 6736 7543 1654 8242...
1374	MESH:D007713	Disease	https://meshb.nlm.nih.gov/record/ui?ui=D007713	4	23	152	208576	0.173913	0.000729	238.645309	2.374418e-09	4.336280e-07	7544 4267 6736 7543
941	MESH:D000596	Chemical	https://meshb.nlm.nih.gov/record/ui?ui=D000596	10	23	7876	208576	0.434783	0.037761	11.514121	4.264456e-09	7.329847e-07	8653 8287 9087 4267 1116 6736 1117 1654 8242 6373

	term_ID	term_type	url	study_count	n_study	population_count	n_population	gene_ratio	background_ratio	odd_ratio	pvalue	padj	related_genes
1565	MESH:D011471	Disease	https://meshb.nlm.nih.gov/record/ui?ui=D011471	17	23	7000	208576	0.739130	0.033561	22.023553	7.102240e-21	5.188186e-18	8653 8287 22829 7404 7544 83259 9087 90665 426...
1469	MESH:D009369	Disease	https://meshb.nlm.nih.gov/record/ui?ui=D009369	20	23	32654	208576	0.869565	0.156557	5.554310	8.499019e-14	4.966827e-11	8653 8287 9086 7404 7544 83259 9087 90665 4267...
1564	MESH:D011470	Disease	https://meshb.nlm.nih.gov/record/ui?ui=D011470	8	23	1473	208576	0.347826	0.007062	49.251985	2.710133e-12	1.131287e-09	8653 8287 7404 7544 9087 1116 6736 7543
1383	MESH:D007938	Disease	https://meshb.nlm.nih.gov/record/ui?ui=D007938	11	23	6664	208576	0.478261	0.031950	14.969048	3.328331e-11	1.215673e-08	8653 7404 4267 1116 6736 7543 9189 1654 7403 8...
1013	MESH:D001943	Disease	https://meshb.nlm.nih.gov/record/ui?ui=D001943	12	23	11341	208576	0.521739	0.054373	9.595473	5.103941e-10	1.147209e-07	8287 9087 90665 4267 1116 6736 7543 1117 1654 ...

	term_ID	term_type	url	study_count	n_study	population_count	n_population	gene_ratio	background_ratio	odd_ratio	pvalue	padj	related_genes
2901	rs3212292	SNP	https://www.ncbi.nlm.nih.gov/snp/rs3212292	2	23	2	208576	0.086957	0.000010	9068.521739	1.163118e-08	0.000001	8287 90665
2914	rs768983	SNP	https://www.ncbi.nlm.nih.gov/snp/rs768983	2	23	2	208576	0.086957	0.000010	9068.521739	1.163118e-08	0.000001	8287 90665
2921	rs9341273	SNP	https://www.ncbi.nlm.nih.gov/snp/rs9341273	2	23	2	208576	0.086957	0.000010	9068.521739	1.163118e-08	0.000001	8287 90665
2880	rs10399931	SNP	https://www.ncbi.nlm.nih.gov/snp/rs10399931	1	23	1	208576	0.043478	0.000005	9068.521739	1.102716e-04	0.000848	1116
2910	rs6817952	SNP	https://www.ncbi.nlm.nih.gov/snp/rs6817952	1	23	1	208576	0.043478	0.000005	9068.521739	1.102716e-04	0.000848	6373

	term_ID	term_type	url	study_count	n_study	population_count	n_population	gene_ratio	background_ratio	odd_ratio	pvalue	padj	related_genes
311	CHEBI:33704	Chemical	https://www.ebi.ac.uk/chebi/searchId.do?chebiI...	16	23	25109	208576	0.695652	0.120383	5.778659	2.050363e-10	5.446509e-08	8653 9086 7404 7544 83259 4267 1116 246777 673...
941	MESH:D000596	Chemical	https://meshb.nlm.nih.gov/record/ui?ui=D000596	10	23	7876	208576	0.434783	0.037761	11.514121	4.264456e-09	7.329847e-07	8653 8287 9087 4267 1116 6736 1117 1654 8242 6373
2654	[missing]oryzomyne	Chemical		2	23	2	208576	0.086957	0.000010	9068.521739	1.163118e-08	1.369294e-06	7544 7543
2658	[missing]p12----p11	Chemical		2	23	2	208576	0.086957	0.000010	9068.521739	1.163118e-08	1.369294e-06	7544 7543
347	CHEBI:50113	Chemical	https://www.ebi.ac.uk/chebi/searchId.do?chebiI...	8	23	4699	208576	0.347826	0.022529	15.439067	2.390840e-08	2.253559e-06	8287 7544 83259 90665 4267 1116 6736 6373

	a	b	c	ac	ca	known
0	[missing]VHA	2838	CHEBI:41132	[missing]VHA,CHEBI:41132	CHEBI:41132,[missing]VHA	False
1	[missing]VHA	2838	CHEBI:51707	[missing]VHA,CHEBI:51707	CHEBI:51707,[missing]VHA	False
2	[missing]VHA	2838	CHEBI:60715	[missing]VHA,CHEBI:60715	CHEBI:60715,[missing]VHA	False
3	[missing]VHA	2838	MESH:C007547	[missing]VHA,MESH:C007547	MESH:C007547,[missing]VHA	False
4	[missing]VHA	2838	MESH:C025059	[missing]VHA,MESH:C025059	MESH:C025059,[missing]VHA	False
5	[missing]VHA	2838	MESH:C042168	[missing]VHA,MESH:C042168	MESH:C042168,[missing]VHA	False
6	[missing]VHA	2838	MESH:C102815	[missing]VHA,MESH:C102815	MESH:C102815,[missing]VHA	False
7	[missing]VHA	2838	MESH:C467767	[missing]VHA,MESH:C467767	MESH:C467767,[missing]VHA	False
8	[missing]VHA	2838	MESH:C542279	[missing]VHA,MESH:C542279	MESH:C542279,[missing]VHA	False
9	[missing]VHA	2838	[missing](PMe2Ph)2PtB10H10-O,H-B10H11Pt(PMe2Ph	[missing]VHA,[missing](PMe2Ph)2PtB10H10-O,H-B1...	[missing](PMe2Ph)2PtB10H10-O,H-B10H11Pt(PMe2Ph...	False