import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from deduplication import DeduplicatePathways
from math import pi
from sklearn.preprocessing import StandardScaler
import plotly.express as px
from sklearn.decomposition import PCA
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter, landscape
from io import BytesIO
import seaborn as sns
import plotly.graph_objects as go
from reportlab.pdfgen import canvas
from io import BytesIO
import seaborn as sns
import plotly.graph_objects as go
import plotly.io as pio
from PIL import Image
import numpy as np
class SavePlotsToPDF:
    """Collects matplotlib and plotly figures and exports them as a PDF report.

    Figures are rasterised to PIL images as they are added; ``export()`` writes
    one figure per landscape-letter page, ``export_html()`` concatenates the
    HTML of the plotly figures into a single file.
    """

    def __init__(self, filename):
        # Rendered PIL images, one per added figure (used by export()).
        self.figures = []
        # Original figure objects as passed in (needed by export_html(),
        # which can only serialise plotly figures).
        self.raw_figures = []
        self.filename = filename

    def add_figure(self, fig):
        """Render *fig* (matplotlib Figure or plotly go.Figure) and store it."""
        self.raw_figures.append(fig)
        # isinstance instead of `type(fig) == go.Figure` so subclasses work too.
        if isinstance(fig, go.Figure):
            fig = self.plotly_to_matplotlib(fig)
        buf = BytesIO()
        fig.savefig(buf, format='png', bbox_inches='tight')
        buf.seek(0)
        # .copy() detaches the image data from the buffer before it is freed.
        self.figures.append(Image.open(buf).copy())

    def plotly_to_matplotlib(self, plotly_fig):
        """Rasterise a plotly figure and wrap the bitmap in a matplotlib figure."""
        # Render in memory instead of leaving a stray temp.png on disk.
        img = Image.open(BytesIO(plotly_fig.to_image(format="png")))
        fig, ax = plt.subplots(figsize=(img.size[0] / 80, img.size[1] / 80), dpi=80)
        ax.imshow(np.array(img))
        ax.axis('off')
        return fig

    def export(self):
        """Write every collected figure to the PDF, one page per figure."""
        page_w, page_h = landscape(letter)
        c = canvas.Canvas(self.filename, pagesize=landscape(letter))
        for img in self.figures:
            # Scale down (never up) to fit the page, preserving aspect ratio.
            scale = min(page_w / img.size[0], page_h / img.size[1], 1.0)
            draw_w, draw_h = img.size[0] * scale, img.size[1] * scale
            # drawInlineImage accepts a PIL image directly; the original passed
            # a BytesIO, which reportlab's canvas does not accept here.
            c.drawInlineImage(img, 0, page_h - draw_h, width=draw_w, height=draw_h)
            # One figure per page; the original stacked everything at i*500 on
            # a single page, pushing all but the first two figures off-page.
            c.showPage()
        c.save()
        print(f"PDF saved as {self.filename}")

    def export_html(self):
        """Concatenate the HTML of all plotly figures into <filename>.html.

        Matplotlib figures are skipped: pio.write_html only serialises plotly
        figures (the original iterated the rasterised PIL images, which always
        failed). Matplotlib figures still appear in the PDF export.
        """
        html_paths = []
        for i, fig in enumerate(self.raw_figures):
            if isinstance(fig, go.Figure):
                path = f"figure{i}.html"
                pio.write_html(fig, path)
                html_paths.append(path)
        # Combine all the per-figure HTML files into a single HTML file.
        with open(f"{self.filename}.html", "w") as output_file:
            for path in html_paths:
                with open(path) as input_file:
                    output_file.write(input_file.read())
        print(f"HTML saved as {self.filename}.html")
# Global report collector; figures are appended to it throughout the notebook.
PDF = SavePlotsToPDF("deduplication_analysis_report.pdf")
# Master switch: set to True to also capture every displayed figure in the PDF.
REPORT_TO_PDF = False
%%bash
# DBRetina round 1 on the raw CPDB associations: index -> pairwise -> cluster.
# Each step is timed with /usr/bin/time -v; stdout/stderr are discarded.
# Indexing
echo "Indexing"
/usr/bin/time -v DBRetina index -a CPDB_human.tsv -o idx_CDBP &> /dev/null
# Pairwise
echo "Pairwise"
/usr/bin/time -v DBRetina pairwise -i idx_CDBP -t 16 &> /dev/null
# Community detection on ochiai 30
echo "Community detection"
/usr/bin/time -v DBRetina cluster -p idx_CDBP_DBRetina_pairwise.tsv -d ochiai -c 30 -o ochiai_30 --community &> /dev/null
Indexing Pairwise Community detection
# Deduplication round 1, driven by the ochiai>=30 communities computed above.
dedup = DeduplicatePathways(
    clusters_file="./ochiai_30_clusters.tsv",  # community clusters from DBRetina
    associations_file="./CPDB_human.tsv",      # raw pathway->gene associations
    index_prefix="idx_CDBP",                   # DBRetina index prefix (bash step above)
    max_cont_cutoff=80,      # max-containment cutoff used for modularity
    exact_ochiai_cutoff=80,  # presumably pairs at/above this ochiai count as exact duplicates — confirm in DeduplicatePathways
    GC = 100,                # NOTE(review): GC semantics not visible here — see DeduplicatePathways docs
)
# Runs the full pipeline and writes all "dedup_1_*" output files (see log output below).
dedup.build_all("dedup_1")
Building all for dedup_1 Associations processed Gene to PSI built Pathway to PSI built Pathway to PPI built Pathway modularity computed with max_cont_cutoff 80 on idx_CDBP_DBRetina_pairwise.tsv Pathways metadata built Exact Ochiai matches removed Gene to PPI PSI exported at dedup_1_gene_to_ppi_psi.tsv Pathways metadata exported at dedup_1_pathways_metadata.tsv Deduplication in process ... Deduplication done. Results exported at dedup_1_clusters.tsv Logs exported at dedup_1_set_cover_logging.tsv Split pathways metadata exported at dedup_1_remaining_pathways_metadata.tsv and dedup_1_removed_pathways_metadata.tsv New associations exported at dedup_1_associations.tsv ------------------------- original number of pathways 3999 pathways removed due to exact duplication: 1303 = 32.583145786446615% number of pathways removed from set-cover only: 2066 = 51.662915728932234% final number of pathways 630 = 15.753938484621155% [DEBUG] total percentage of pathways: 100.0% original overlap score: 14.149696320114327 remaining overlap score: 4.744015719899965
%%bash
# DBRetina round 2: same pipeline, re-run on the round-1 deduplicated associations.
# Indexing
echo "Indexing"
/usr/bin/time -v DBRetina index -a dedup_1_associations.tsv -o idx_dedup_CDBP &> /dev/null
# Pairwise
echo "Pairwise"
/usr/bin/time -v DBRetina pairwise -i idx_dedup_CDBP -t 16 &> /dev/null
# Community detection on ochiai 30
echo "Community detection"
# NOTE(review): unlike round 1, this step is not wrapped in /usr/bin/time — confirm intentional.
DBRetina cluster -p idx_dedup_CDBP_DBRetina_pairwise.tsv -d ochiai -c 30 -o dedup_ochiai_30 --community &> /dev/null
Indexing Pairwise Community detection
# Deduplication round 2 on the round-1 output, with identical cutoffs
# (sanity check: almost nothing should be removed the second time).
dedup = DeduplicatePathways(
    clusters_file="./dedup_ochiai_30_clusters.tsv",  # communities on deduplicated data
    associations_file="./dedup_1_associations.tsv",  # output of round 1
    index_prefix="idx_dedup_CDBP",
    max_cont_cutoff=80,
    exact_ochiai_cutoff=80,
    GC = 100,
)
# Writes all "dedup_2_*" output files.
dedup.build_all("dedup_2")
Building all for dedup_2 Associations processed Gene to PSI built Pathway to PSI built Pathway to PPI built Pathway modularity computed with max_cont_cutoff 80 on idx_dedup_CDBP_DBRetina_pairwise.tsv Pathways metadata built Exact Ochiai matches removed Gene to PPI PSI exported at dedup_2_gene_to_ppi_psi.tsv Pathways metadata exported at dedup_2_pathways_metadata.tsv Deduplication in process ... Deduplication done. Results exported at dedup_2_clusters.tsv Logs exported at dedup_2_set_cover_logging.tsv Split pathways metadata exported at dedup_2_remaining_pathways_metadata.tsv and dedup_2_removed_pathways_metadata.tsv New associations exported at dedup_2_associations.tsv ------------------------- original number of pathways 630 pathways removed due to exact duplication: 3 = 0.47619047619047616% number of pathways removed from set-cover only: 12 = 1.9047619047619047% final number of pathways 615 = 97.61904761904762% [DEBUG] total percentage of pathways: 100.0% original overlap score: 4.744015719899965 remaining overlap score: 4.744015719899965
# NOTE: the "origianl" misspelling in these variable names is kept as-is —
# every later cell refers to them by these exact names.
# original data
df_origianl_all_metadata = pd.read_csv("./dedup_1_pathways_metadata.tsv", sep="\t")
# original pathways that survived (passed) the deduplication
df_origianl_remaining_pathways = pd.read_csv("./dedup_1_remaining_pathways_metadata.tsv", sep="\t")
# original pathways that were removed by the deduplication
df_origianl_removed_pathways = pd.read_csv("./dedup_1_removed_pathways_metadata.tsv", sep="\t")
# Pathways after deduplication and calculating new stats for them
df_deduplicated_all_metadata = pd.read_csv("./dedup_2_pathways_metadata.tsv", sep="\t")
import plotly.graph_objects as go

# Features compared throughout the rest of the analysis.
features = ['average_ppi', 'average_psi', 'fragmentation', 'heterogeneity', 'modularity']

# One box plot per feature: Remaining vs Removed vs Deduplicated pathways.
_box_groups = [
    (df_origianl_remaining_pathways, 'Remaining'),
    (df_origianl_removed_pathways, 'Removed'),
    (df_deduplicated_all_metadata, 'Deduplicated'),
]
for feature in features:
    fig = go.Figure(
        data=[go.Box(y=frame[feature], name=label) for frame, label in _box_groups]
    )
    fig.update_layout(title_text=f'Box Plot For {feature} Feature', height=600, width=800)
    fig.show()
    if REPORT_TO_PDF:
        PDF.add_figure(fig)
# For each feature, scatter its value before (x) vs after (y) deduplication.
# NOTE(review): this pairs rows positionally across the two dataframes —
# confirm both are ordered by the same pathway.
for feature in features:
    before_vals = df_origianl_remaining_pathways[feature]
    after_vals = df_deduplicated_all_metadata[feature]
    fig = go.Figure(
        data=[go.Scatter(x=before_vals, y=after_vals, mode='markers', name='Pathways')]
    )
    fig.update_layout(
        title_text=f'Comparison of {feature} before and after deduplication',
        xaxis_title=f'{feature} (Remaining)',
        yaxis_title=f'{feature} (Deduplicated)',
        height=600, width=800,
    )
    # Dashed reference line from (min, min) to (max, max) of each axis' range.
    fig.add_shape(
        type='line', line=dict(dash='dash'),
        x0=before_vals.min(), x1=before_vals.max(),
        y0=after_vals.min(), y1=after_vals.max(),
    )
    fig.show()
    if REPORT_TO_PDF:
        PDF.add_figure(fig)
import numpy as np

features = ['average_ppi', 'average_psi', 'fragmentation', 'heterogeneity', 'modularity']

# log2(|x| + 1) compresses the feature ranges onto one comparable radial scale
# (the +1 also keeps zero values defined under the log).
remaining_means = np.log2(abs(df_origianl_remaining_pathways[features]) + 1).mean()
deduplicated_means = np.log2(abs(df_deduplicated_all_metadata[features]) + 1).mean()

# Radar (spider) chart of the per-feature log-means for the two groups.
fig = go.Figure(
    data=[
        go.Scatterpolar(r=remaining_means, theta=features, fill='toself', name='Remaining'),
        go.Scatterpolar(r=deduplicated_means, theta=features, fill='toself', name='Deduplicated'),
    ]
)
fig.update_layout(
    polar=dict(radialaxis=dict(visible=True)),
    showlegend=True,
)
fig.show()
if REPORT_TO_PDF:
    PDF.add_figure(fig)
import seaborn as sns
import matplotlib.pyplot as plt

# --- Correlation heatmap of the features BEFORE deduplication ---
correlations = df_origianl_remaining_pathways[features].corr()
# Capture the figure object: the original never did, so PDF.add_figure(fig)
# exported a stale plotly figure left over from an earlier cell.
fig = plt.figure(figsize=(10, 8))
sns.heatmap(correlations, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Heatmap of Features before Deduplication')
plt.show()
if REPORT_TO_PDF:
    PDF.add_figure(fig)

# --- Correlation heatmap of the features AFTER deduplication ---
correlations = df_deduplicated_all_metadata[features].corr()
fig = plt.figure(figsize=(10, 8))  # same stale-figure fix as above
sns.heatmap(correlations, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Heatmap of Features after Deduplication')
plt.show()
if REPORT_TO_PDF:
    PDF.add_figure(fig)
# Bubble chart on the remaining pathways: x = pleiotropy (log scale),
# y = modularity, bubble size = gene count, colour = specificity.
fig = px.scatter(df_origianl_remaining_pathways, x="average_ppi", y="modularity", size="no_of_genes", color="average_psi", hover_name="pathway", log_x=True, size_max=60)
fig.show()
if REPORT_TO_PDF:
    PDF.add_figure(fig)
# Same idea on the deduplicated set with swapped axes.
# NOTE(review): unlike the plot above, this one is never added to the PDF — confirm intentional.
fig = px.scatter(df_deduplicated_all_metadata, x="average_psi", y="average_ppi", size="no_of_genes", color="modularity", hover_name="pathway", log_y=True, size_max=100)
fig.show()
import plotly.graph_objects as go

features = ['average_ppi', 'average_psi', 'fragmentation', 'heterogeneity', 'modularity']

# Per-feature central tendency for each pathway group.
# abs(...) + 1 keeps every bar positive and visible even for features whose
# mean is negative (a log2 variant was tried previously and abandoned).
remaining_means = abs(df_origianl_remaining_pathways[features].mean()) + 1
deduplicated_means = abs(df_deduplicated_all_metadata[features].mean()) + 1
removed_means = abs(df_origianl_removed_pathways[features].mean()) + 1

fig = go.Figure(data=[
    go.Bar(name='Remaining', x=features, y=remaining_means, marker_color='darkblue'),
    go.Bar(name='Deduplicated', x=features, y=deduplicated_means, marker_color='green'),
    go.Bar(name='Removed', x=features, y=removed_means, marker_color='darkred'),
])
# Grouped bars. Axis label fixed: the plotted values are |mean| + 1 —
# the original label "Log Median Value" described neither a log nor a median.
fig.update_layout(barmode='group',
                  title='Comparison features for same pathways before/after deduplication',
                  yaxis_title='|Mean| + 1')
fig.show()
if REPORT_TO_PDF:
    PDF.add_figure(fig)
import plotly.express as px

# Tag each group with a numeric 'category' so the continuous colour scale
# separates them on the plot.
# NOTE: these assignments mutate the three source dataframes in place.
df_origianl_remaining_pathways['category'] = 0    # 'Remaining'
df_deduplicated_all_metadata['category'] = 100    # 'Deduplicated'
df_origianl_removed_pathways['category'] = 500    # 'Removed'
df_combined = pd.concat([df_origianl_remaining_pathways, df_deduplicated_all_metadata, df_origianl_removed_pathways])

features = ['average_ppi', 'average_psi', 'fragmentation', 'heterogeneity', 'modularity']
# Min-max normalise every feature to [0, 1] so the axes are comparable.
feature_block = df_combined[features]
df_combined[features] = (feature_block - feature_block.min()) / (feature_block.max() - feature_block.min())

# Parallel-coordinates view of all three groups, coloured by category.
fig = px.parallel_coordinates(
    df_combined, color='category', dimensions=features,
    color_continuous_scale=px.colors.sequential.Plasma, labels={'category': 'Category'},
)
fig.show()
if REPORT_TO_PDF:
    PDF.add_figure(fig)
import plotly.express as px

# Axis labels shared by the three modularity-coloured parallel-coordinates plots.
_PC_LABELS = {
    "PPI": "Pathway Pleiotropy Index",
    "PSI": "Pathway Specificity Index",
    "fragmentation": "Fragmentation",
    "heterogeneity": "Heterogeneity",
    "modularity": "Modularity",
    "length": "Length",
}

def _modularity_parallel_coordinates(frame, title):
    """Parallel-coordinates plot coloured by modularity with shared styling."""
    return px.parallel_coordinates(
        frame, color='modularity', title=title, labels=_PC_LABELS,
        color_continuous_scale=px.colors.diverging.Tealrose,
        color_continuous_midpoint=2,
    )

# Same plot for each of the three pathway groups.
for _frame, _title in [
    (df_origianl_remaining_pathways, 'final pathways before deduplication'),
    (df_origianl_removed_pathways, 'Removed Pathways'),
    (df_deduplicated_all_metadata, 'final pathways after deduplication'),
]:
    fig = _modularity_parallel_coordinates(_frame, _title)
    fig.show()
    if REPORT_TO_PDF:
        PDF.add_figure(fig)
# Standardise all three groups with the scaler fitted on the FULL original
# metadata, so the group means are comparable z-scores on one common scale.
scaler = StandardScaler()
original_std = scaler.fit_transform(df_origianl_all_metadata[features])
deduplicated_std = scaler.transform(df_deduplicated_all_metadata[features])
missing_std = scaler.transform(df_origianl_removed_pathways[features])
# Per-feature mean of the standardised values: one value per radar axis.
mean_original = np.mean(original_std, axis=0).tolist()
mean_deduplicated = np.mean(deduplicated_std, axis=0).tolist()
mean_missing = np.mean(missing_std, axis=0).tolist()
num_vars = len(features)
# Evenly spaced axis angles; the first angle is appended again so the
# plotted polygon closes on itself.
angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()
angles += angles[:1]
fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))
# Add original
ax.fill(angles, mean_original+mean_original[:1], color='blue', alpha=0.25)
ax.plot(angles, mean_original+mean_original[:1], color='blue', linewidth=1, label='Original')
# Add deduplicated
ax.fill(angles, mean_deduplicated+mean_deduplicated[:1], color='green', alpha=0.25)
ax.plot(angles, mean_deduplicated+mean_deduplicated[:1], color='green', linewidth=1, label='Deduplicated')
# Add missing (= removed) pathways
ax.fill(angles, mean_missing+mean_missing[:1], color='red', alpha=0.25)
ax.plot(angles, mean_missing+mean_missing[:1], color='red', linewidth=1, label='Missing')
# Label each spoke with its feature name.
ax.set_thetagrids(np.degrees(angles[:-1]), features)
ax.set_rlabel_position(180 / num_vars)
plt.title('Mean Feature Comparison: Original vs Deduplicated vs Missing', size=20, color='black', y=1.1)
plt.legend(loc='upper right', bbox_to_anchor=(1.1, 1.1))
plt.show()
if REPORT_TO_PDF:
    PDF.add_figure(fig)
# Side-by-side distribution of each feature across the three groups.
for feature in features:
    # Capture the figure: the original discarded plt.figure()'s return value,
    # so PDF.add_figure(fig) exported a stale figure from an earlier cell.
    fig = plt.figure(figsize=(18, 6))
    plt.subplot(131)  # 1 row, 3 columns, subplot 1
    sns.histplot(df_origianl_remaining_pathways[feature], color='blue', kde=True)
    plt.title('Remaining')
    plt.xlabel(feature)
    plt.ylabel('Frequency')
    plt.subplot(132)  # 1 row, 3 columns, subplot 2
    sns.histplot(df_origianl_removed_pathways[feature], color='green', kde=True)
    plt.title('Removed')
    plt.xlabel(feature)
    plt.subplot(133)  # 1 row, 3 columns, subplot 3
    sns.histplot(df_deduplicated_all_metadata[feature], color='red', kde=True)
    plt.title('Deduplicated')
    plt.xlabel(feature)
    plt.tight_layout()  # to ensure proper spacing between subplots
    plt.show()
    if REPORT_TO_PDF:
        PDF.add_figure(fig)
# normalise the three groups on a COMMON scale before comparing means.
# BUG FIX: the original called fit_transform separately per group, which
# z-scores each group against itself — every per-feature mean is then exactly
# 0 by construction and the radar plot collapses to a point. Fit the scaler
# once on all rows and only transform each group.
scaler = StandardScaler()
_all_rows = pd.concat([
    df_origianl_remaining_pathways[features],
    df_deduplicated_all_metadata[features],
    df_origianl_removed_pathways[features],
])
scaler.fit(_all_rows)
original_std = scaler.transform(df_origianl_remaining_pathways[features])
deduplicated_std = scaler.transform(df_deduplicated_all_metadata[features])
removed_std = scaler.transform(df_origianl_removed_pathways[features])
# calculate the per-feature mean for each dataset (one value per radar axis)
mean_original = np.mean(original_std, axis=0).tolist()
mean_deduplicated = np.mean(deduplicated_std, axis=0).tolist()
mean_removed = np.mean(removed_std, axis=0).tolist()
# number of variables/features
num_vars = len(features)
# compute angle of each axis
angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()
# ensure that the plot is a complete circle
angles += angles[:1]
# initialize the spider plot
fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))
# plot data and fill with color for original dataset
ax.fill(angles, mean_original+mean_original[:1], 'blue', alpha=0.1)
ax.plot(angles, mean_original+mean_original[:1], linewidth=2, linestyle='solid', label='Remaining')
# plot data and fill with color for deduplicated dataset
ax.fill(angles, mean_deduplicated+mean_deduplicated[:1], 'green', alpha=0.1)
ax.plot(angles, mean_deduplicated+mean_deduplicated[:1], linewidth=2, linestyle='solid', label='Deduplicated')
# plot data and fill with color for removed dataset
ax.fill(angles, mean_removed+mean_removed[:1], 'red', alpha=0.1)
ax.plot(angles, mean_removed+mean_removed[:1], linewidth=2, linestyle='solid', label='Removed')
# add legend and title
# NOTE(review): bbox_to_anchor=(0.1, 0.1) with loc='upper right' places the
# legend near the bottom-left — confirm that placement is intended.
plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))
plt.title('Mean feature comparison: Original vs Deduplicated')
# set the feature names as the angular tick labels
plt.xticks(angles[:-1], features, color='grey', size=8)
# Draw ylabels
ax.set_rlabel_position(0)
plt.yticks(color="grey", size=7)
plt.show()
if REPORT_TO_PDF:
    PDF.add_figure(fig)
# Pairplot of the remaining (pre-dedup survivor) pathways.
# sns.pairplot returns a PairGrid; capture it so the right figure can be
# exported — the original passed a stale `fig` from an earlier cell.
grid = sns.pairplot(df_origianl_remaining_pathways[features])
plt.suptitle('Pairplot - Remaining Pathways')
plt.show()
if REPORT_TO_PDF:
    PDF.add_figure(grid.fig)  # PairGrid.fig is the underlying matplotlib Figure

# Pairplot of the deduplicated dataset (same stale-figure fix).
grid = sns.pairplot(df_deduplicated_all_metadata[features])
plt.suptitle('Pairplot - Deduplicated Dataset')
plt.show()
if REPORT_TO_PDF:
    PDF.add_figure(grid.fig)
# Scatter plot for every unordered pair of features, all three groups overlaid.
for i in range(len(features)):
    for j in range(i + 1, len(features)):
        # Capture the figure: the original discarded plt.figure()'s return
        # value, so PDF.add_figure(fig) exported a stale earlier figure.
        fig = plt.figure(figsize=(10, 6))
        plt.scatter(df_origianl_remaining_pathways[features[i]], df_origianl_remaining_pathways[features[j]], alpha=0.5, label='Remaining', color='blue')
        plt.scatter(df_origianl_removed_pathways[features[i]], df_origianl_removed_pathways[features[j]], alpha=0.5, label='Removed', color='red')
        plt.scatter(df_deduplicated_all_metadata[features[i]], df_deduplicated_all_metadata[features[j]], alpha=0.5, label='Deduplicated', color='green')
        plt.title(f'{features[i]} vs {features[j]}')
        plt.xlabel(features[i])
        plt.ylabel(features[j])
        plt.legend(loc='upper right')
        plt.show()
        if REPORT_TO_PDF:
            PDF.add_figure(fig)
# Grouped (side-by-side) histograms of each feature across the three groups.
for feature in features:
    hist_specs = [
        (df_origianl_remaining_pathways, 'Remaining', '#EB89B5'),
        (df_origianl_removed_pathways, 'Removed', '#330C73'),
        (df_deduplicated_all_metadata, 'Deduplicated', 'green'),
    ]
    fig = go.Figure(
        data=[
            go.Histogram(x=frame[feature], name=label, marker_color=colour, nbinsx=20)
            for frame, label, colour in hist_specs
        ]
    )
    fig.update_layout(
        title_text=f'Distribution of {feature}',  # title of plot
        xaxis_title_text=f'{feature}',            # x-axis label
        yaxis_title_text='Count',                 # y-axis label
        bargap=0.2,       # gap between bars of adjacent location coordinates
        bargroupgap=0.1,  # gap between bars of the same location coordinates
        barmode='group',  # bars are placed next to each other
        autosize=False,
        width=800,
        height=500,
    )
    fig.show()
    if REPORT_TO_PDF:
        PDF.add_figure(fig)
# One spider chart per feature: the mean value in each of the three groups.
for feature in features:
    # Mean of this feature per group; insertion order fixes the axis order.
    group_means = {
        'Remaining': df_origianl_remaining_pathways[feature].mean(),
        'Removed': df_origianl_removed_pathways[feature].mean(),
        'Deduplicated': df_deduplicated_all_metadata[feature].mean(),
    }
    df_means = pd.DataFrame(dict(
        r=list(group_means.values()),
        theta=list(group_means.keys()),
    ))
    fig = go.Figure(
        data=[go.Scatterpolar(name=feature, r=df_means['r'], theta=df_means['theta'], fill='toself')]
    )
    fig.update_layout(
        polar=dict(
            radialaxis=dict(visible=True, range=[0, df_means['r'].max()]),
        ),
        showlegend=False,
        title=f"Spider Chart for {feature}: Kept vs Missing vs Deduplicated Pathways",
    )
    fig.show()
    if REPORT_TO_PDF:
        PDF.add_figure(fig)
# Final step: flush every collected figure into the PDF report on disk.
if REPORT_TO_PDF:
    PDF.export()