import networkx as nx
import pandas as pd
import numpy as np
from bokeh.io import output_notebook, export_svgs, show
import sys
sys.path.insert(0, "../")
from pubnet.graph.nx import lsm_to_nx
# Render Bokeh figures inline in the notebook; must run before any show() call.
output_notebook()
# Path prefix handed to lsm_to_nx to locate the stored network
# (the exact file layout is defined by the pubnet loader, not visible here).
lsm_prefix = "../res/build_1/global_net_1904"
# Load the network. The code below uses G.nodes, G.edges and G.degree(),
# so G is presumably a networkx graph -- TODO confirm directed vs. undirected.
G = lsm_to_nx(lsm_prefix)
Number of nodes and edges:
# Size of the loaded network: node count first, then edge count.
G.number_of_nodes()
G.number_of_edges()
Node counts by type
from collections import Counter

# Tally how many nodes carry each value of the 'type' node attribute.
c_nodes = Counter(attrs['type'] for _, attrs in G.nodes(data=True))
c_nodes
from math import pi

from bokeh.palettes import Pastel1
from bokeh.plotting import figure
from bokeh.transform import cumsum

# Pie chart of the number of nodes per node type (one wedge per type).
x = c_nodes

# One row per node type with columns 'type' and 'value'.
data = pd.Series(x).reset_index(name='value').rename(columns={'index': 'type'})
# Wedge width: the type's share of all nodes, expressed in radians.
data['angle'] = data['value'] / data['value'].sum() * 2 * pi

# Pastel1 only provides palettes with 3 to 9 colours, so indexing it with the
# raw number of types fails for fewer than 3 or more than 9 types.  Clamp the
# lookup and cycle through the colours; for 3..9 types this yields exactly
# Pastel1[len(x)].
palette = Pastel1[min(max(len(x), 3), 9)]
data['color'] = [palette[i % len(palette)] for i in range(len(x))]

# NOTE(review): `plot_height=` and `legend=` are the spellings this notebook
# was written against; confirm the pinned Bokeh version before renaming them.
p = figure(plot_height=350, title="Pie Chart", toolbar_location=None,
           tools="hover", tooltips="@type: @value", x_range=(-0.5, 1.0))

# Each wedge starts where the previous one ended (cumulative sum of angles).
p.wedge(x=0, y=1, radius=0.4,
        start_angle=cumsum('angle', include_zero=True), end_angle=cumsum('angle'),
        line_color="white", fill_color='color', legend='type', source=data)

# A pie chart needs no axes or grid.
p.axis.axis_label = None
p.axis.visible = False
p.grid.grid_line_color = None

# Switch to the SVG backend so the figure can be exported as vector graphics.
p.output_backend = "svg"
export_svgs(p, filename="node_types_pie.svg")
show(p)
Edge counts by type pair
# Count edges by the (type, type) pair of their two endpoints, in the order
# the endpoints are reported by the graph.
c_edges = Counter(
    (G.nodes[u]['type'], G.nodes[v]['type']) for u, v in G.edges()
)

# Fold (A, B) and (B, A) into one entry by sorting each pair of type names.
c_edges_ = {}
for type_pair, n_edges in c_edges.items():
    merged_key = tuple(sorted(type_pair))
    c_edges_[merged_key] = c_edges_.get(merged_key, 0) + n_edges
c_edges_
# Square table of edge counts indexed by node type on both axes.
node_types = sorted(c_nodes)
m_ = np.zeros((len(node_types),) * 2)
edge_counts = pd.DataFrame(m_, index=node_types, columns=node_types)

# Keys of c_edges_ are sorted pairs, so only cells with row <= column
# (in alphabetical order of type name) are filled; the rest stay 0.
for (type_a, type_b), n_edges in c_edges_.items():
    edge_counts.loc[type_a, type_b] = n_edges

# Axis names become the column names after stack()/reset_index() downstream.
edge_counts.index.name = 'type1'
edge_counts.columns.name = 'type2'
from bokeh.models import LogColorMapper, BasicTicker, PrintfTickFormatter, ColorBar
from bokeh.plotting import figure

# Heatmap of the number of edges per (type1, type2) pair, coloured on a log
# scale.  (The Bokeh sample-data import that used to sit here was immediately
# shadowed by `data = edge_counts` and only added a failure mode when the
# sample data is not installed, so it has been dropped.)
data = edge_counts
node_types = list(data.index)

# Long format: one row per (type1, type2) cell holding its edge count.
df = pd.DataFrame(data.stack(), columns=['count']).reset_index()
# Keep only non-empty cells: zero cannot be placed on a logarithmic colour
# scale, and it would also drag the mapper's lower bound down to 0.
df = df[df['count'] != 0]

# Nine-step colour ramp used for the log-scaled counts.
colors = ["#75968f", "#a5bab7", "#c9d9d3", "#e2e2e2", "#dfccce", "#ddb7b1", "#cc7878", "#933b41", "#550b1d"]
mapper = LogColorMapper(palette=colors, low=df['count'].min(), high=df['count'].max())

TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"

# Categorical axes on both sides, one category per node type.
p = figure(title="Number of edges",
           x_range=node_types, y_range=node_types,
           x_axis_location="above", plot_width=700, plot_height=600,
           tools=TOOLS, toolbar_location='below',
           tooltips=[('type', '@type1 - @type2'), ('count', '@count')])

# Strip grid, axis lines and ticks so only the coloured cells remain.
p.grid.grid_line_color = None
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.axis.major_label_standoff = 0
p.xaxis.major_label_orientation = pi / 3

# One unit square per non-empty (type1, type2) cell, coloured by its count.
p.rect(x="type1", y="type2", width=1, height=1,
       source=df,
       fill_color={'field': 'count', 'transform': mapper},
       line_color=None)

color_bar = ColorBar(color_mapper=mapper,
                     ticker=BasicTicker(desired_num_ticks=len(colors)),
                     formatter=PrintfTickFormatter(format="%d"),
                     label_standoff=15, border_line_color=None, location=(0, 0))
p.add_layout(color_bar, 'right')

# Switch to the SVG backend so the figure can be exported as vector graphics.
p.output_backend = "svg"
export_svgs(p, filename="edge_types_heatmap.svg")
show(p)
Node degree distribution
# Degree of every node, largest first.
nodes_degree = sorted((d for _, d in G.degree()), reverse=True)
# 200 equal-width bins over the observed degree range.
hist, bin_edges = np.histogram(nodes_degree, bins=200)

# Both axes are logarithmic, so empty bins (count 0) and a bin starting at 0
# cannot be drawn; keep only the points that have a position on the plot.
bin_starts = bin_edges[:-1]
visible = (hist > 0) & (bin_starts > 0)

TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"
p = figure(title="Node degree distribution",
           tools=TOOLS,
           background_fill_color="#fafafa",
           plot_width=600, plot_height=300,
           y_axis_type="log",
           x_axis_type="log")

# x = degree (left edge of each bin), y = number of nodes in the bin.
# The original call passed these the other way round, so the data
# contradicted the 'Degree' / 'Count' axis labels below.
p.circle(bin_starts[visible], hist[visible], alpha=0.5, fill_color="navy", size=14)

# No explicit y-range start: 0 is not a valid bound on a log axis, so the
# range is left to auto-fit the (strictly positive) counts.
p.xaxis.axis_label = 'Degree'
p.yaxis.axis_label = 'Count'
p.grid.grid_line_color = "white"

# Switch to the SVG backend so the figure can be exported as vector graphics.
p.output_backend = "svg"
export_svgs(p, filename="node_degree_dist.svg")
show(p)
Edge strength and co-occurrence distributions
# Pull the 'strength' and 'coo' attributes of every edge into two parallel
# lists (same edge order in both).
edge_attrs = [attrs for _, _, attrs in G.edges(data=True)]
edge_strength = [attrs['strength'] for attrs in edge_attrs]
edge_coo = [attrs['coo'] for attrs in edge_attrs]
# 200 equal-width bins over the observed edge 'strength' values.
hist, bin_edges = np.histogram(edge_strength, bins=200)

# Both axes are logarithmic, so empty bins (count 0) and a bin starting at a
# non-positive value cannot be drawn; keep only the displayable points.
bin_starts = bin_edges[:-1]
visible = (hist > 0) & (bin_starts > 0)

TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"
p = figure(title="Edge strength distribution",
           tools=TOOLS,
           background_fill_color="#fafafa",
           plot_width=600, plot_height=300,
           y_axis_type="log",
           x_axis_type="log")

# x = strength (left edge of each bin), y = number of edges in the bin.
# The original call passed these the other way round, so the data
# contradicted the 'Strength' / 'Count' axis labels below.
p.circle(bin_starts[visible], hist[visible], alpha=0.5, fill_color="orange", size=14)

# No explicit y-range start: 0 is not a valid bound on a log axis, so the
# range is left to auto-fit the (strictly positive) counts.
p.xaxis.axis_label = 'Strength'
p.yaxis.axis_label = 'Count'
p.grid.grid_line_color = "white"

# Switch to the SVG backend so the figure can be exported as vector graphics.
p.output_backend = "svg"
export_svgs(p, filename="edge_strength_dist.svg")
show(p)
# 200 equal-width bins over the observed edge 'coo' (co-occurrence) values.
hist, bin_edges = np.histogram(edge_coo, bins=200)

# Both axes are logarithmic, so empty bins (count 0) and a bin starting at a
# non-positive value cannot be drawn; keep only the displayable points.
bin_starts = bin_edges[:-1]
visible = (hist > 0) & (bin_starts > 0)

TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"
p = figure(title="Edge co-occurence distribution",
           tools=TOOLS,
           background_fill_color="#fafafa",
           plot_width=600, plot_height=300,
           y_axis_type="log",
           x_axis_type="log")

# x = co-occurrence (left edge of each bin), y = number of edges in the bin.
# The original call passed these the other way round, so the data
# contradicted the axis labels below.
p.circle(bin_starts[visible], hist[visible], alpha=0.5, fill_color="blue", size=14)

# No explicit y-range start: 0 is not a valid bound on a log axis, so the
# range is left to auto-fit the (strictly positive) counts.
p.xaxis.axis_label = 'Co-occurence'
p.yaxis.axis_label = 'Count'
p.grid.grid_line_color = "white"

# Switch to the SVG backend so the figure can be exported as vector graphics.
p.output_backend = "svg"
export_svgs(p, filename="edge_coo_dist.svg")
show(p)
Example data: SAGD_00055
from pubnet.graph.bsea import BSEA, EntrezEnsemblConvert

# Load the expression table; its first column is renamed to 'ensembl_id'.
input_csv = "../data/SAGD_00055.csv"
exp = pd.read_csv(input_csv)
exp.columns = ["ensembl_id", *exp.columns[1:]]

# Genes with adjusted p-value <= 0.05 form the query set.
sig = exp.loc[exp["padj"] <= 0.05]
test_genes = list(sig["ensembl_id"])
len(test_genes)

# Convert the Ensembl IDs with the project converter, then stringify the
# results before handing them to the enrichment call.
cvt = EntrezEnsemblConvert()
test_entrez = cvt.ensembl2entrez(test_genes)
genes = list(map(str, test_entrez))

# Run the enrichment on the network, then apply the correction and filter
# steps in that order.
# NOTE(review): the meaning of target_types='-' is defined by BSEA -- confirm.
bsea = BSEA(G, input_types=['Gene'], target_types='-')
bsea.enrich(genes)
bsea.multiple_test_corretion()
bsea.filter(by='padj', threshold=0.001)

# Inspect the resulting table: size, rows per term type, and top rows
# overall (excluding 'Species') and for selected term types.
df = bsea.enrichment_table
df.shape[0]
df.groupby('term_type')['term_ID'].count()
df[df['term_type'] != 'Species'].head(10)
df[df['term_type'] == 'Disease'].head(5)
df[df['term_type'] == 'SNP'].head(5)
df[df['term_type'] == 'Chemical'].head(5)
# Enumerate A-B-C triples (Disease - Gene - Chemical) that share the middle
# node B, and flag those whose A-C pair is already linked in the network.
ABC = ('Disease', 'Gene', 'Chemical')
ab_links = []
bc_links = []
for u, v in G.edges():
    # An edge may be stored in either endpoint order.  The original loop only
    # matched (A, B) and (B, C) as stored, silently missing edges reported as
    # (B, A) or (C, B); test both orientations and record the normalised one.
    for src, dst in ((u, v), (v, u)):
        type_pair = (G.nodes[src]['type'], G.nodes[dst]['type'])
        if type_pair == ABC[:2]:
            ab_links.append((src, dst))
        elif type_pair == ABC[1:]:
            bc_links.append((src, dst))

# drop_duplicates guards against the same link being seen twice (e.g. when
# both directions of an edge are stored), which would inflate the join.
df_ab = pd.DataFrame(ab_links, columns=['a', 'b']).drop_duplicates()
df_bc = pd.DataFrame(bc_links, columns=['b', 'c']).drop_duplicates()

# Join on the shared middle node: one row per (a, b, c) path.
df_abc = pd.merge(df_ab, df_bc, on=['b'])
df_abc.shape[0]

# Existing edges as "u,v" strings (string concatenation: node ids must be str).
known_ac = {u + "," + v for u, v in G.edges()}
# Check the A-C pair in both orders, since an edge may be stored either way.
df_abc['ac'] = df_abc.a + "," + df_abc.c
df_abc['ca'] = df_abc.c + "," + df_abc.a
df_abc['known'] = df_abc.ac.isin(known_ac) | df_abc.ca.isin(known_ac)

# Number of A-C pairs reachable through B that have no direct edge.
print(df_abc[~df_abc['known']].shape[0])
df_abc.head(10)