import pandas as pd
import numpy as np
import ssl
from plotnine import *
from dfply import *
from wordcloud import *
import matplotlib.pyplot as plt
import spacy
import spacy.cli
# Fetch the small English spaCy model (no-op cost aside, safe to re-run) and
# load it; `nlp` is the pipeline used later to tag proper nouns in summaries.
spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
0. Loading Libraries
1. Data Loading
# NOTE(review): this replaces the default HTTPS context factory with the
# unverified one, i.e. TLS certificate verification is disabled process-wide.
# Presumably a workaround for local certificate issues when fetching the CSV —
# confirm it is still needed before reusing this outside a notebook.
ssl._create_default_https_context = ssl._create_unverified_context

# Load the TidyTuesday (2024-02-20) R Consortium ISC grants dataset.
isc_grants = pd.read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2024/2024-02-20/isc_grants.csv")

# Displaying first 5 rows before data transformation
isc_grants.head(5)
year | group | title | funded | proposed_by | summary | website | |
---|---|---|---|---|---|---|---|
0 | 2023 | 1 | The future of DBI (extension 1) | 10000 | Kirill Müller | This proposal mostly focuses on the maintenanc... | NaN |
1 | 2023 | 1 | Secure TLS Communications for R | 10000 | Charlie Gao | The project aims to implement secure connectio... | NaN |
2 | 2023 | 1 | volcalc: Calculate predicted volatility of che... | 12265 | Kristina Riemer | This ISC funded project focuses on the develop... | NaN |
3 | 2023 | 1 | autotest: Automated testing of R packages | 3000 | Mark Padgham | The project aims to develop an R package to au... | NaN |
4 | 2023 | 1 | api2r: An R Package for Auto-Generating R API ... | 15750 | Jon Harmon | This project aims to develop an R package call... | NaN |
2. Data Transformation & Clustering
Code for Transformation and Clustering
# Parse every summary with spaCy once and reuse the parsed docs for both
# derived columns; the parse is the expensive step and was previously run twice.
summary_docs = isc_grants.summary.map(nlp)

# Extracting proper nouns (POS tag "PROPN") using the prebuilt spaCy model
isc_grants['keywords'] = [[token.text for token in doc if token.pos_ == "PROPN"] for doc in summary_docs]

# Extracting the L2 norm of each proper noun's vector using the prebuilt spaCy model
isc_grants['L2_form_keywords'] = [[token.vector_norm for token in doc if token.pos_ == "PROPN"] for doc in summary_docs]

# Adding up the per-keyword L2 norms to arrive at a single number per grant
isc_grants['L2_sum_keywords'] = isc_grants['L2_form_keywords'].map(lambda x: np.sum(x))

# KMeans clustering code; I like using pipelines.
# Features are standardized first so that `funded` (dollars) and the keyword
# norm sum contribute on a comparable scale to the distance computation.
pipe = Pipeline(steps=[
    ('standarsclar', StandardScaler()),
    ('kmeans', KMeans(n_clusters=4, random_state=42))
])

isc_grants['clusters'] = pipe.fit_predict(isc_grants[['funded', 'L2_sum_keywords']])

# Displaying first 5 rows after data transformation
isc_grants.head(5)
year | group | title | funded | proposed_by | summary | website | keywords | L2_form_keywords | L2_sum_keywords | clusters | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2023 | 1 | The future of DBI (extension 1) | 10000 | Kirill Müller | This proposal mostly focuses on the maintenanc... | NaN | [DBI, DBItest, RSQLite, RMariaDB, RPostgres, OS] | [8.515169, 8.737299, 9.32899, 7.872729, 7.7452... | 50.481686 | 2 |
1 | 2023 | 1 | Secure TLS Communications for R | 10000 | Charlie Gao | The project aims to implement secure connectio... | NaN | [TLS] | [7.7507687] | 7.750769 | 2 |
2 | 2023 | 1 | volcalc: Calculate predicted volatility of che... | 12265 | Kristina Riemer | This ISC funded project focuses on the develop... | NaN | [ISC, algorithm] | [8.402423, 8.565787] | 16.968210 | 2 |
3 | 2023 | 1 | autotest: Automated testing of R packages | 3000 | Mark Padgham | The project aims to develop an R package to au... | NaN | [GPL-3, CRAN] | [7.7737813, 7.8731694] | 15.646951 | 2 |
4 | 2023 | 1 | api2r: An R Package for Auto-Generating R API ... | 15750 | Jon Harmon | This project aims to develop an R package call... | NaN | [api2r, OpenAPI, Specification, OAS, OAS, R.] | [7.429026, 6.894724, 7.6077356, 9.437281, 7.47... | 47.943073 | 2 |
3. Plotting
3.1. Total Funding Per Cluster
Code for Segment Plot by Cluster
# Total funding per cluster, drawn as a segment from 0 to the total with a
# star marker at the end. `+` binds tighter than `>>`, so the whole ggplot
# expression is assembled first and then receives the summarized frame.
plot = (
    isc_grants
    >> group_by(X.clusters)
    >> summarize(fun=np.sum(X.funded), count=n(X.clusters))
    >> ggplot(aes(y='clusters', x='fun'))
    + geom_point(shape="*", size=4, color="blue")
    + geom_segment(aes(xend=0, yend='clusters'), color="green", size=1)
    + theme_minimal()
    + labs(
        x="Total Funded Amount in $",
        y="Cluster",
        title="Total Amount of Funding per Cluster",
    )
    # Show axis ticks in thousands of dollars, e.g. 250000 -> "$250K".
    + scale_x_continuous(labels=lambda x: ['${:.0f}K'.format(i / 1000) for i in x])
)

plot.save("Segment.png", width=8, height=6)
Interpretation
- Cluster `0` is characterized by a high level of funding, whereas Cluster `3` exhibits relatively lower funding.
3.2. Total Funding vs Total Grants Relationship
Code for Scatter Plot between funding amount and grants
# Yearly total funding vs. yearly number of grants, with a fitted straight
# line (no confidence band) to show the relationship.
plot = (
    isc_grants
    >> group_by(X.year)
    >> summarize(fun=np.sum(X.funded), count=n(X.clusters))
    >> ggplot(aes(x='fun', y='count'))
    + geom_point(color="blue")
    + geom_smooth(method='lm', se=False, color="green")
    + theme_minimal()
    + labs(
        x="Funding Amount",
        y="Number of Grants",
        title="Linear Relationship ",
        subtitle="b/w #grants & funding$",
    )
    # Show axis ticks in thousands of dollars, e.g. 250000 -> "$250K".
    + scale_x_continuous(labels=lambda x: ['${:.0f}K'.format(i / 1000) for i in x])
)

plot.save("Funding.png", width=8, height=6)
Interpretation
- A clear linear relationship is evident between the number of grants and the funding amount
3.3. Funding Amount Per Cluster Over Years
Code for Plot of Funding Amount per Cluster Over Years
# Funding amount per cluster, one facet per year.
plot1 = (
    isc_grants
    >> group_by(X.year, X.clusters)
    >> summarize(fun=np.sum(X.funded), count=n(X.clusters))
    >> ggplot(aes(x='clusters', y='fun'))
    + geom_col(fill="blue")
    + facet_wrap('~year')
    + theme_minimal()
    + theme(figure_size=(4, 3))
    # Show axis ticks in thousands of dollars, e.g. 250000 -> "$250K".
    + scale_y_continuous(labels=lambda x: ['${:.0f}K'.format(i / 1000) for i in x])
    + labs(
        x="Clusters",
        y="Funding Amount in $",
        caption="Funding Amount across Clusters YonY",
    )
)

plot1.save("Funding_Clusters.png", width=4, height=3)
# Number of grants per cluster, one facet per year.
plot2 = (
    isc_grants
    >> group_by(X.year, X.clusters)
    >> summarize(fun=np.sum(X.funded), count=n(X.clusters))
    >> ggplot(aes(x='clusters', y='count'))
    + geom_col(fill="orange")
    + facet_wrap('~year')
    + theme_minimal()
    + theme(figure_size=(4, 3))
    + labs(
        x="Clusters",
        y="Number of Grants",
        caption="Number of Grants across Clusters YonY",
    )
)

plot2.save("Count_Clusters.png", width=4, height=3)
Interpretation
- A nearly identical linear relationship between the number of grants and funding amount is apparent, indicating a consistent funding amount per grant.
- Furthermore, distinct funding values emerge across the clusters, highlighting variations in funding amounts despite similar numbers of grants within each cluster.
3.4. Combined WordCloud
Code for Combined Words of WordCloud
# Concatenate every grant's keyword list into one list (Series.sum() on a
# column of lists performs list concatenation), then join into a single string.
combined_text = ' '.join(isc_grants['keywords'].sum())

# Build the word cloud over all grants.
wordcloud = (
    WordCloud(
        max_font_size=50,
        max_words=100,
        background_color="white",
    )
    .generate(combined_text)
)

# Render the cloud without axes and save it to disk.
plt.figure(figsize=(8, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title("Combined WordCloud", fontsize=16)
plt.savefig('wordcloud_image.png')
Interpretation
- The prevalence of `CRAN` and `ISC` is noteworthy, and it would have been beneficial to exclude them for a more focused analysis.
- Additionally, within the R consortium grants, there is notable diversity with the inclusion of topics such as `JavaScript`, `DBI`, and `Python`. This suggests a robust research scope, particularly in the realm of multiple integrations.
3.5. WordCloud by Cluster
Code for WordCloud by Cluster
# One keyword string per cluster label. Note the numbering offset: the
# variables are numbered 1-4 while the KMeans cluster labels are 0-3.
combined_text1 = ' '.join(isc_grants[isc_grants['clusters'] == 0]['keywords'].sum())
combined_text2 = ' '.join(isc_grants[isc_grants['clusters'] == 1]['keywords'].sum())
combined_text3 = ' '.join(isc_grants[isc_grants['clusters'] == 2]['keywords'].sum())
combined_text4 = ' '.join(isc_grants[isc_grants['clusters'] == 3]['keywords'].sum())

# Word cloud for cluster 0.
wordcloud1 = (
    WordCloud(
        max_font_size=50,
        max_words=100,
        background_color="white",
    )
    .generate(combined_text1)
)

# Word cloud for cluster 1.
wordcloud2 = (
    WordCloud(
        max_font_size=50,
        max_words=100,
        background_color="white",
    )
    .generate(combined_text2)
)

# Word cloud for cluster 2.
wordcloud3 = (
    WordCloud(
        max_font_size=50,
        max_words=100,
        background_color="white",
    )
    .generate(combined_text3)
)

# Word cloud for cluster 3.
wordcloud4 = (
    WordCloud(
        max_font_size=50,
        max_words=100,
        background_color="white",
    )
    .generate(combined_text4)
)

# Render each cloud in its own figure, without axes, and save it to disk.
plt.figure(figsize=(4, 3))
plt.imshow(wordcloud1, interpolation='bilinear')
plt.axis('off')
plt.title("Cluster 0 WordCloud", fontsize=12)
plt.savefig('wordcloud_image1.png')

plt.figure(figsize=(4, 3))
plt.imshow(wordcloud2, interpolation='bilinear')
plt.axis('off')
plt.title("Cluster 1 WordCloud", fontsize=12)
plt.savefig('wordcloud_image2.png')

plt.figure(figsize=(4, 3))
plt.imshow(wordcloud3, interpolation='bilinear')
plt.axis('off')
plt.title("Cluster 2 WordCloud", fontsize=12)
plt.savefig('wordcloud_image3.png')

plt.figure(figsize=(4, 3))
plt.imshow(wordcloud4, interpolation='bilinear')
plt.axis('off')
plt.title("Cluster 3 WordCloud", fontsize=12)
plt.savefig('wordcloud_image4.png')
Interpretation
- In the first cluster we see `DBI` and `Windows`.
- In the second cluster we see `Python` and `JavaScript`.
- In the third cluster we see `RSQLite` and `RMariaDB`.
- In the fourth cluster we see `LLMs` and `AWS`.
Certainly, it is evident that clustering has provided valuable insights into the primary themes of research grants. The observation that the average funding amount per grant is relatively consistent does not imply the superiority of one research segment over another. Instead, it indicates that the number of research projects in the second and fourth segments is lower when compared to the first and third segments.