In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from deduplication import DeduplicatePathways
from math import pi
from sklearn.preprocessing import StandardScaler
import plotly.express as px
from sklearn.decomposition import PCA
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter, landscape
from io import BytesIO
import seaborn as sns
import plotly.graph_objects as go
from reportlab.pdfgen import canvas
from io import BytesIO
import seaborn as sns
import plotly.graph_objects as go
import plotly.io as pio
from PIL import Image
import numpy as np
In [ ]:
class SavePlotsToPDF:
    def __init__(self, filename):
        self.figures = []
        self.filename = filename

    def add_figure(self, fig):
        if type(fig) == go.Figure:
            fig = self.plotly_to_matplotlib(fig)
        buf = BytesIO()
        fig.savefig(buf, format='png', bbox_inches='tight')
        buf.seek(0)
        img_data = np.array(Image.open(buf))
        img = Image.fromarray(img_data)
        self.figures.append(img)

    def plotly_to_matplotlib(self, plotly_fig):
        plotly_fig.write_image("temp.png")
        img = Image.open("temp.png")
        fig, ax = plt.subplots(figsize=(img.size[0]/80, img.size[1]/80), dpi=80)
        ax.imshow(np.array(img))
        ax.axis('off')
        return fig

    def export(self):
        c = canvas.Canvas(self.filename, pagesize=landscape(letter))

        for i, fig in enumerate(self.figures):
            img_data = BytesIO()
            fig.save(img_data, format='png')
            img_data.seek(0)
            c.drawInlineImage(img_data, 0, i * 500)

        c.save()
        print(f"PDF saved as {self.filename}")

    def export_html(self):
        # Save each figure as an individual HTML file, and then combine them
        for i, fig in enumerate(self.figures):
            pio.write_html(fig, f"figure{i}.html")

        # Combine all the HTML files into a single HTML file
        with open(f"{self.filename}.html", "w") as output_file:
            for i in range(len(self.figures)):
                with open(f"figure{i}.html") as input_file:
                    for line in input_file:
                        output_file.write(line)

        print(f"HTML saved as {self.filename}.html")
In [ ]:
PDF = SavePlotsToPDF("deduplication_analysis_report.pdf")
REPORT_TO_PDF = False
In [ ]:
%%bash
# Indexing
echo "Indexing"
/usr/bin/time -v DBRetina index -a CPDB_human.tsv -o idx_CDBP &> /dev/null

# Pairwise
echo "Pairwise"
/usr/bin/time -v DBRetina pairwise -i idx_CDBP -t 16 &> /dev/null

# Community detection on ochiai 30
echo "Community detection"
/usr/bin/time -v DBRetina cluster -p idx_CDBP_DBRetina_pairwise.tsv -d ochiai -c 30 -o ochiai_30 --community &> /dev/null
Indexing
Pairwise
Community detection

Deduplication¶

In [ ]:
dedup = DeduplicatePathways(
        clusters_file="./ochiai_30_clusters.tsv", 
        associations_file="./CPDB_human.tsv", 
        index_prefix="idx_CDBP",
        max_cont_cutoff=80,
        exact_ochiai_cutoff=80,
        GC = 100,
        )

dedup.build_all("dedup_1")
Building all for dedup_1
Associations processed
Gene to PSI built
Pathway to PSI built
Pathway to PPI built
Pathway modularity computed with max_cont_cutoff 80 on idx_CDBP_DBRetina_pairwise.tsv
Pathways metadata built
Exact Ochiai matches removed
Gene to PPI PSI exported at dedup_1_gene_to_ppi_psi.tsv
Pathways metadata exported at dedup_1_pathways_metadata.tsv
Deduplication in process ...
Deduplication done. Results exported at dedup_1_clusters.tsv
Logs exported at dedup_1_set_cover_logging.tsv
Split pathways metadata exported at dedup_1_remaining_pathways_metadata.tsv and dedup_1_removed_pathways_metadata.tsv
New associations exported at dedup_1_associations.tsv
-------------------------
original number of pathways 3999
pathways removed due to exact duplication: 1303 = 32.583145786446615%
number of pathways removed from set-cover only: 2066 = 51.662915728932234%
final number of pathways 630 = 15.753938484621155%
[DEBUG] total percentage of pathways: 100.0%
original overlap score: 14.149696320114327
remaining overlap score: 4.744015719899965

After deduplication Index/Pairwise/Ochiai_30¶

In [ ]:
%%bash
# Indexing
echo "Indexing"
/usr/bin/time -v DBRetina index -a dedup_1_associations.tsv -o idx_dedup_CDBP &> /dev/null

# Pairwise
echo "Pairwise"
/usr/bin/time -v DBRetina pairwise -i idx_dedup_CDBP -t 16 &> /dev/null

# Community detection on ochiai 30
echo "Community detection"
DBRetina cluster -p idx_dedup_CDBP_DBRetina_pairwise.tsv -d ochiai -c 30 -o dedup_ochiai_30 --community &> /dev/null
Indexing
Pairwise
Community detection
In [ ]:
dedup = DeduplicatePathways(
        clusters_file="./dedup_ochiai_30_clusters.tsv", 
        associations_file="./dedup_1_associations.tsv", 
        index_prefix="idx_dedup_CDBP",
        max_cont_cutoff=80,
        exact_ochiai_cutoff=80,
        GC = 100,
        )

dedup.build_all("dedup_2")
Building all for dedup_2
Associations processed
Gene to PSI built
Pathway to PSI built
Pathway to PPI built
Pathway modularity computed with max_cont_cutoff 80 on idx_dedup_CDBP_DBRetina_pairwise.tsv
Pathways metadata built
Exact Ochiai matches removed
Gene to PPI PSI exported at dedup_2_gene_to_ppi_psi.tsv
Pathways metadata exported at dedup_2_pathways_metadata.tsv
Deduplication in process ...
Deduplication done. Results exported at dedup_2_clusters.tsv
Logs exported at dedup_2_set_cover_logging.tsv
Split pathways metadata exported at dedup_2_remaining_pathways_metadata.tsv and dedup_2_removed_pathways_metadata.tsv
New associations exported at dedup_2_associations.tsv
-------------------------
original number of pathways 630
pathways removed due to exact duplication: 3 = 0.47619047619047616%
number of pathways removed from set-cover only: 12 = 1.9047619047619047%
final number of pathways 615 = 97.61904761904762%
[DEBUG] total percentage of pathways: 100.0%
original overlap score: 4.744015719899965
remaining overlap score: 4.744015719899965

Visualizations¶

In [ ]:
# original data
df_origianl_all_metadata = pd.read_csv("./dedup_1_pathways_metadata.tsv", sep="\t")
# original data that successed to passed the deduplication
df_origianl_remaining_pathways = pd.read_csv("./dedup_1_remaining_pathways_metadata.tsv", sep="\t")
# original data that failed to passed the deduplication
df_origianl_removed_pathways = pd.read_csv("./dedup_1_removed_pathways_metadata.tsv", sep="\t")
# Pathways after deduplication and calculating new stats for them
df_deduplicated_all_metadata = pd.read_csv("./dedup_2_pathways_metadata.tsv", sep="\t")
In [ ]:
import plotly.graph_objects as go

features = ['average_ppi', 'average_psi', 'fragmentation', 'heterogeneity', 'modularity']

for feature in features:
    fig = go.Figure()

    fig.add_trace(go.Box(y=df_origianl_remaining_pathways[feature], name='Remaining'))
    fig.add_trace(go.Box(y=df_origianl_removed_pathways[feature], name='Removed'))
    fig.add_trace(go.Box(y=df_deduplicated_all_metadata[feature], name='Deduplicated'))

    fig.update_layout(title_text=f'Box Plot For {feature} Feature', height=600, width=800)
    fig.show()
    
    if REPORT_TO_PDF:
        PDF.add_figure(fig)

pair plots for features before and after deduplication¶

In [ ]:
# For each feature, create a paired plot
for feature in features:
    fig = go.Figure()

    fig.add_trace(go.Scatter(x=df_origianl_remaining_pathways[feature],
                             y=df_deduplicated_all_metadata[feature],
                             mode='markers',
                             name='Pathways'))

    fig.update_layout(title_text=f'Comparison of {feature} before and after deduplication',
                      xaxis_title=f'{feature} (Remaining)',
                      yaxis_title=f'{feature} (Deduplicated)',
                      height=600, width=800)

    # Add a diagonal line for reference
    fig.add_shape(type='line', line=dict(dash='dash'),
                  x0=df_origianl_remaining_pathways[feature].min(), x1=df_origianl_remaining_pathways[feature].max(),
                  y0=df_deduplicated_all_metadata[feature].min(), y1=df_deduplicated_all_metadata[feature].max())

    fig.show()
    if REPORT_TO_PDF:
        PDF.add_figure(fig)

Radar (Spider) Plots¶

In [ ]:
import numpy as np

features = ['average_ppi', 'average_psi', 'fragmentation', 'heterogeneity', 'modularity']

# Apply log2 transformation (+1 is added to handle any zero values)
remaining_log = np.log2(abs(df_origianl_remaining_pathways[features]) + 1)
deduplicated_log = np.log2(abs(df_deduplicated_all_metadata[features]) + 1)

# Calculate means
remaining_means = remaining_log.mean()
deduplicated_means = deduplicated_log.mean()

fig = go.Figure()

# Create a radar plot for the means
fig.add_trace(go.Scatterpolar(
    r=remaining_means,
    theta=features,
    fill='toself',
    name='Remaining'
))
fig.add_trace(go.Scatterpolar(
    r=deduplicated_means,
    theta=features,
    fill='toself',
    name='Deduplicated'
))

fig.update_layout(
    polar=dict(radialaxis=dict(visible=True)),
    showlegend=True
)

fig.show()
if REPORT_TO_PDF:
    PDF.add_figure(fig)

Features correlation¶

In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt

# Calculate correlations
correlations = df_origianl_remaining_pathways[features].corr()

# Create a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlations, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Heatmap of Features before Deduplication')
plt.show()

if REPORT_TO_PDF:
    PDF.add_figure(fig)

# Calculate correlations
correlations = df_deduplicated_all_metadata[features].corr()

# Create a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlations, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Heatmap of Features after Deduplication')
plt.show()
if REPORT_TO_PDF:
    PDF.add_figure(fig)

Bubble plots¶

In [ ]:
fig = px.scatter(df_origianl_remaining_pathways, x="average_ppi", y="modularity", size="no_of_genes", color="average_psi", hover_name="pathway", log_x=True, size_max=60)
fig.show()

if REPORT_TO_PDF:
    PDF.add_figure(fig)
In [ ]:
fig = px.scatter(df_deduplicated_all_metadata, x="average_psi", y="average_ppi", size="no_of_genes", color="modularity", hover_name="pathway", log_y=True, size_max=100)
fig.show()

Bar plots¶

In [ ]:
import plotly.graph_objects as go

features = ['average_ppi', 'average_psi', 'fragmentation', 'heterogeneity', 'modularity']

# Calculate means
# remaining_means = np.log2(abs(df_origianl_remaining_pathways[features].median()) + 1)
# deduplicated_means = np.log2(abs(df_deduplicated_all_metadata[features].median()) + 1)
# removed_means = np.log2(abs(df_origianl_removed_pathways[features].median()) + 1)

remaining_means = abs(df_origianl_remaining_pathways[features].mean()) + 1
deduplicated_means = abs(df_deduplicated_all_metadata[features].mean()) + 1
removed_means = abs(df_origianl_removed_pathways[features].mean()) + 1


fig = go.Figure(data=[
    go.Bar(name='Remaining', x=features, y=remaining_means, marker_color='darkblue'),
    go.Bar(name='Deduplicated', x=features, y=deduplicated_means, marker_color = 'green'),
    go.Bar(name='Removed', x=features, y=removed_means, marker_color = 'darkred')
])

# Change the bar mode
fig.update_layout(barmode='group', title='Comparison features for same pathways before/after deduplication', yaxis_title='Log Median Value')
fig.show()

if REPORT_TO_PDF:
    PDF.add_figure(fig)

Parllel plots across dataframes¶

In [ ]:
import plotly.express as px

# Combine the two dataframes and add a 'category' column
df_origianl_remaining_pathways['category'] = 0  # 'Remaining'
df_deduplicated_all_metadata['category'] = 100  # 'Deduplicated'
df_origianl_removed_pathways['category'] = 500  # 'Removed'
df_combined = pd.concat([df_origianl_remaining_pathways, df_deduplicated_all_metadata, df_origianl_removed_pathways])

# Normalize data (optional)
features = ['average_ppi', 'average_psi', 'fragmentation', 'heterogeneity', 'modularity']
df_combined[features] = (df_combined[features] - df_combined[features].min()) / (df_combined[features].max() - df_combined[features].min())

# Create the plot
fig = px.parallel_coordinates(df_combined, color='category', dimensions=features, 
                              color_continuous_scale=px.colors.sequential.Plasma, labels={'category':'Category'})

fig.show()

if REPORT_TO_PDF:
    PDF.add_figure(fig)
In [ ]:
 

Parallel Plots¶

In [ ]:
import plotly.express as px

fig = px.parallel_coordinates(df_origianl_remaining_pathways, color='modularity',
                              title='final pathways before deduplication',
                              
                              labels={"PPI": "Pathway Pleiotropy Index",
                                      "PSI": "Pathway Specificity Index",
                                      "fragmentation": "Fragmentation",
                                      "heterogeneity": "Heterogeneity",
                                      "modularity": "Modularity",
                                      "length": "Length"},
                              color_continuous_scale=px.colors.diverging.Tealrose,
                              color_continuous_midpoint=2)
fig.show()

if REPORT_TO_PDF:
    PDF.add_figure(fig)
    
#--------------------------


fig = px.parallel_coordinates(df_origianl_removed_pathways, color='modularity',
                              title='Removed Pathways',
                              
                              labels={"PPI": "Pathway Pleiotropy Index",
                                      "PSI": "Pathway Specificity Index",
                                      "fragmentation": "Fragmentation",
                                      "heterogeneity": "Heterogeneity",
                                      "modularity": "Modularity",
                                      "length": "Length"},
                              color_continuous_scale=px.colors.diverging.Tealrose,
                              color_continuous_midpoint=2)
fig.show()

if REPORT_TO_PDF:
    PDF.add_figure(fig)

#--------------------------



import plotly.express as px

fig = px.parallel_coordinates(df_deduplicated_all_metadata, color='modularity',
                              title='final pathways after deduplication',
                              labels={"PPI": "Pathway Pleiotropy Index",
                                      "PSI": "Pathway Specificity Index",
                                      "fragmentation": "Fragmentation",
                                      "heterogeneity": "Heterogeneity",
                                      "modularity": "Modularity",
                                      "length": "Length"},
                              color_continuous_scale=px.colors.diverging.Tealrose,
                              color_continuous_midpoint=2)
fig.show()

if REPORT_TO_PDF:
    PDF.add_figure(fig)

In [ ]:
scaler = StandardScaler()

original_std = scaler.fit_transform(df_origianl_all_metadata[features])
deduplicated_std = scaler.transform(df_deduplicated_all_metadata[features])
missing_std = scaler.transform(df_origianl_removed_pathways[features])

mean_original = np.mean(original_std, axis=0).tolist()
mean_deduplicated = np.mean(deduplicated_std, axis=0).tolist()
mean_missing = np.mean(missing_std, axis=0).tolist()

num_vars = len(features)

angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()
angles += angles[:1]

fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))

# Add original
ax.fill(angles, mean_original+mean_original[:1], color='blue', alpha=0.25)
ax.plot(angles, mean_original+mean_original[:1], color='blue', linewidth=1, label='Original')

# Add deduplicated
ax.fill(angles, mean_deduplicated+mean_deduplicated[:1], color='green', alpha=0.25)
ax.plot(angles, mean_deduplicated+mean_deduplicated[:1], color='green', linewidth=1, label='Deduplicated')

# Add missing
ax.fill(angles, mean_missing+mean_missing[:1], color='red', alpha=0.25)
ax.plot(angles, mean_missing+mean_missing[:1], color='red', linewidth=1, label='Missing')

ax.set_thetagrids(np.degrees(angles[:-1]), features)
ax.set_rlabel_position(180 / num_vars)

plt.title('Mean Feature Comparison: Original vs Deduplicated vs Missing', size=20, color='black', y=1.1)
plt.legend(loc='upper right', bbox_to_anchor=(1.1, 1.1))
plt.show()

if REPORT_TO_PDF:
    PDF.add_figure(fig)
In [ ]:
 
In [ ]:
for feature in features:
    plt.figure(figsize=(18,6))
    
    plt.subplot(131) # 1 row, 3 columns, subplot 1
    sns.histplot(df_origianl_remaining_pathways[feature], color='blue', kde=True)
    plt.title('Remaining')
    plt.xlabel(feature)
    plt.ylabel('Frequency')

    plt.subplot(132) # 1 row, 3 columns, subplot 2
    sns.histplot(df_origianl_removed_pathways[feature], color='green', kde=True)
    plt.title('Removed')
    plt.xlabel(feature)

    plt.subplot(133) # 1 row, 3 columns, subplot 3
    sns.histplot(df_deduplicated_all_metadata[feature], color='red', kde=True)
    plt.title('Deduplicated')
    plt.xlabel(feature)
    
    plt.tight_layout() # to ensure proper spacing between subplots
    plt.show()
    
    if REPORT_TO_PDF:
        PDF.add_figure(fig)
In [ ]:
 
In [ ]:
# normalize the data
scaler = StandardScaler()

original_std = scaler.fit_transform(df_origianl_remaining_pathways[features])
deduplicated_std = scaler.fit_transform(df_deduplicated_all_metadata[features])
removed_std = scaler.fit_transform(df_origianl_removed_pathways[features])

# calculate the mean for each dataset
mean_original = np.mean(original_std, axis=0).tolist()
mean_deduplicated = np.mean(deduplicated_std, axis=0).tolist()
mean_removed = np.mean(removed_std, axis=0).tolist()

# number of variables/features
num_vars = len(features)

# compute angle of each axis
angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()

# ensure that the plot is a complete circle
angles += angles[:1]

# initialize the spider plot
fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))

# plot data and fill with color for original dataset
ax.fill(angles, mean_original+mean_original[:1], 'blue', alpha=0.1)
ax.plot(angles, mean_original+mean_original[:1], linewidth=2, linestyle='solid', label='Remaining')

# plot data and fill with color for deduplicated dataset
ax.fill(angles, mean_deduplicated+mean_deduplicated[:1], 'green', alpha=0.1)
ax.plot(angles, mean_deduplicated+mean_deduplicated[:1], linewidth=2, linestyle='solid', label='Deduplicated')

# plot data and fill with color for removed dataset
ax.fill(angles, mean_removed+mean_removed[:1], 'red', alpha=0.1)
ax.plot(angles, mean_removed+mean_removed[:1], linewidth=2, linestyle='solid', label='Removed')

# add legend and title
plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))
plt.title('Mean feature comparison: Original vs Deduplicated')

# set the number of ticks based on the number of features (6 in this case)
plt.xticks(angles[:-1], features, color='grey', size=8)

# Draw ylabels
ax.set_rlabel_position(0)
plt.yticks(color="grey", size=7)
plt.show()

if REPORT_TO_PDF:
    PDF.add_figure(fig)
In [ ]:
 
In [ ]:
 
In [ ]:
# original dataset
sns.pairplot(df_origianl_remaining_pathways[features])
plt.suptitle('Pairplot - Remaining Pathways')
plt.show()
if REPORT_TO_PDF:
    PDF.add_figure(fig)

# deduplicated dataset
sns.pairplot(df_deduplicated_all_metadata[features])
plt.suptitle('Pairplot - Deduplicated Dataset')
plt.show()

if REPORT_TO_PDF:
    PDF.add_figure(fig)
In [ ]:
 
In [ ]:
 
In [ ]:
# plot scatter plots for pairs of numerical features
for i in range(len(features)):
    for j in range(i+1, len(features)):
        plt.figure(figsize=(10,6))
        plt.scatter(df_origianl_remaining_pathways[features[i]], df_origianl_remaining_pathways[features[j]], alpha=0.5, label='Remaining', color='blue')
        plt.scatter(df_origianl_removed_pathways[features[i]], df_origianl_removed_pathways[features[j]], alpha=0.5, label='Removed', color='red')
        plt.scatter(df_deduplicated_all_metadata[features[i]], df_deduplicated_all_metadata[features[j]], alpha=0.5, label='Deduplicated', color='green')
        plt.title(f'{features[i]} vs {features[j]}')
        plt.xlabel(features[i])
        plt.ylabel(features[j])
        plt.legend(loc='upper right')
        plt.show()
        
        if REPORT_TO_PDF:
            PDF.add_figure(fig)
In [ ]:
 
In [ ]:
 
In [ ]:
for feature in features:
    # Create a Figure
    fig = go.Figure()

    # Add the 'Kept' histogram
    fig.add_trace(go.Histogram(
        x=df_origianl_remaining_pathways[feature],
        #histnorm='percent',
        name='Remaining',
        marker_color='#EB89B5',
        nbinsx=20  # specify the number of bins
    ))

    # Add the 'Missing' histogram
    fig.add_trace(go.Histogram(
        x=df_origianl_removed_pathways[feature],
        #histnorm='percent',
        name='Removed',
        marker_color='#330C73',
        nbinsx=20  # specify the number of bins
    ))

    # Add the 'Deduplicated' histogram
    fig.add_trace(go.Histogram(
        x=df_deduplicated_all_metadata[feature],
        #histnorm='percent',
        name='Deduplicated',
        marker_color='green',
        nbinsx=20  # specify the number of bins
    ))

    # Finalize layout
    fig.update_layout(
        title_text=f'Distribution of {feature}', # title of plot
        xaxis_title_text=f'{feature}', # xaxis label
        yaxis_title_text='Count', # yaxis label
        bargap=0.2, # gap between bars of adjacent location coordinates
        bargroupgap=0.1, # gap between bars of the same location coordinates
        barmode='group', # bars are placed next to each other
        autosize=False,
        width=800,
        height=500
    )
    
    # log x-axis
    # fig.update_yaxes(type="log")

    fig.show()
    
    if REPORT_TO_PDF:
        PDF.add_figure(fig)
In [ ]:
 
In [ ]:
for feature in features:
    # Extract the feature values for the kept, missing and deduplicated datasets
    kept_values = df_origianl_remaining_pathways[feature].mean()
    missing_values = df_origianl_removed_pathways[feature].mean()
    deduplicated_values = df_deduplicated_all_metadata[feature].mean()
    
    # Convert the means to a DataFrame for plotly
    df_means = pd.DataFrame(dict(
        r=[kept_values, missing_values, deduplicated_values],
        theta=['Remaining', 'Removed', 'Deduplicated']
    ))

    # Create radar plots
    fig = go.Figure()

    # Add traces for Kept, Missing and Deduplicated datasets
    fig.add_trace(go.Scatterpolar(
        name=feature,
        r=df_means['r'],
        theta=df_means['theta'],
        fill='toself'
    ))

    # Finalize layout
    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, df_means['r'].max()]
            )
        ),
        showlegend=False,
        title=f"Spider Chart for {feature}: Kept vs Missing vs Deduplicated Pathways"
    )

    fig.show()
    
    if REPORT_TO_PDF:
        PDF.add_figure(fig)
In [ ]:
 
In [ ]:
 
In [ ]:
 

Export PDF¶

In [ ]:
if REPORT_TO_PDF:
    PDF.export()