Skip to content

Commit

Permalink
Add xtra step of cleaning quotation marks from gje data, and output c…
Browse files Browse the repository at this point in the history
…sv of data from new green skills notebook
  • Loading branch information
lizgzil committed Apr 8, 2024
1 parent 75ea4f6 commit d785c8a
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 35 deletions.
4 changes: 4 additions & 0 deletions dap_prinz_green_jobs/analysis/ojo_analysis/gje_formatting.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,15 @@ def decap_inds(top_5_sics):
# Convert all single quotes to double quotes (needed for the GJE).
# Because pandas saves dicts with single quotes, the dict columns have to be
# read in as strings and then converted like this.

def clean_quotes(phrase):
    """Return ``str(phrase)`` with single quotes normalised to double quotes.

    Possessive apostrophes are dropped first instead of being converted,
    because turning them into double quotes messes the data up:

    * an ``s'`` followed by a space and a letter, e.g.
      "other builders' carpentry" becomes "other builders carpentry";
    * a letter followed by ``'s``, e.g.
      "style a dog's coat" becomes "style a dogs coat".

    Every single quote that is still present afterwards is replaced with a
    double quote.
    """
    # (pattern, replacement) pairs, applied in this order.
    possessive_rules = (
        (r"s\' ([a-zA-Z])", r"s \1"),
        (r"([a-zA-Z])\'s", r"\1s"),
    )

    text = str(phrase)
    for pattern, replacement in possessive_rules:
        text = re.sub(pattern, replacement, text)

    return text.replace("'", '"')

Expand Down
79 changes: 44 additions & 35 deletions dap_prinz_green_jobs/notebooks/new_skills_analysis.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@
"output_type": "stream",
"text": [
"20240404\n",
"2024-04-08 10:03:02,384 - botocore.credentials - INFO - Found credentials in shared credentials file: ~/.aws/credentials\n"
"2024-04-08 12:34:42,390 - botocore.credentials - INFO - Found credentials in shared credentials file: ~/.aws/credentials\n"
]
}
],
Expand All @@ -155,7 +155,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"2024-04-08 10:03:05,471 - botocore.credentials - INFO - Found credentials in shared credentials file: ~/.aws/credentials\n"
"2024-04-08 12:34:43,192 - botocore.credentials - INFO - Found credentials in shared credentials file: ~/.aws/credentials\n"
]
},
{
Expand All @@ -170,7 +170,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"2024-04-08 10:05:25,745 - dap_prinz_green_jobs - INFO - Loading skills taxonomies\n"
"2024-04-08 12:37:04,152 - dap_prinz_green_jobs - INFO - Loading skills taxonomies\n"
]
}
],
Expand Down Expand Up @@ -299,7 +299,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -308,7 +308,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -319,7 +319,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -329,7 +329,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -342,7 +342,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 16,
"metadata": {},
"outputs": [
{
Expand All @@ -355,7 +355,7 @@
"Name: occupation_greenness, dtype: int64"
]
},
"execution_count": 15,
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -366,7 +366,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -384,7 +384,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 18,
"metadata": {},
"outputs": [
{
Expand All @@ -401,7 +401,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -414,7 +414,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -433,7 +433,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -464,7 +464,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 22,
"metadata": {},
"outputs": [
{
Expand All @@ -491,7 +491,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -509,7 +509,7 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 24,
"metadata": {},
"outputs": [
{
Expand All @@ -518,7 +518,7 @@
"13126"
]
},
"execution_count": 23,
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -529,14 +529,14 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████████████████████████████████████████████████████████████████████████████| 13126/13126 [00:00<00:00, 18070.78it/s]\n"
"100%|██████████████████████████████████| 13126/13126 [00:00<00:00, 17785.40it/s]\n"
]
}
],
Expand All @@ -555,7 +555,7 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 26,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -591,7 +591,7 @@
},
{
"cell_type": "code",
"execution_count": 26,
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -602,7 +602,7 @@
},
{
"cell_type": "code",
"execution_count": 27,
"execution_count": 28,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -632,7 +632,7 @@
},
{
"cell_type": "code",
"execution_count": 28,
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -656,7 +656,7 @@
},
{
"cell_type": "code",
"execution_count": 29,
"execution_count": 30,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -700,7 +700,7 @@
},
{
"cell_type": "code",
"execution_count": 30,
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -726,7 +726,7 @@
},
{
"cell_type": "code",
"execution_count": 31,
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -783,31 +783,31 @@
},
{
"cell_type": "code",
"execution_count": 32,
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"<style>\n",
" #altair-viz-3bcd7723b36b46aaa6034533b32a7db8.vega-embed {\n",
" #altair-viz-3972a682ef9d49ac939f8169f0e704ee.vega-embed {\n",
" width: 100%;\n",
" display: flex;\n",
" }\n",
"\n",
" #altair-viz-3bcd7723b36b46aaa6034533b32a7db8.vega-embed details,\n",
" #altair-viz-3bcd7723b36b46aaa6034533b32a7db8.vega-embed details summary {\n",
" #altair-viz-3972a682ef9d49ac939f8169f0e704ee.vega-embed details,\n",
" #altair-viz-3972a682ef9d49ac939f8169f0e704ee.vega-embed details summary {\n",
" position: relative;\n",
" }\n",
"</style>\n",
"<div id=\"altair-viz-3bcd7723b36b46aaa6034533b32a7db8\"></div>\n",
"<div id=\"altair-viz-3972a682ef9d49ac939f8169f0e704ee\"></div>\n",
"<script type=\"text/javascript\">\n",
" var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
" (function(spec, embedOpt){\n",
" let outputDiv = document.currentScript.previousElementSibling;\n",
" if (outputDiv.id !== \"altair-viz-3bcd7723b36b46aaa6034533b32a7db8\") {\n",
" outputDiv = document.getElementById(\"altair-viz-3bcd7723b36b46aaa6034533b32a7db8\");\n",
" if (outputDiv.id !== \"altair-viz-3972a682ef9d49ac939f8169f0e704ee\") {\n",
" outputDiv = document.getElementById(\"altair-viz-3972a682ef9d49ac939f8169f0e704ee\");\n",
" }\n",
" const paths = {\n",
" \"vega\": \"https://cdn.jsdelivr.net/npm/vega@5?noext\",\n",
Expand Down Expand Up @@ -860,7 +860,7 @@
"alt.Chart(...)"
]
},
"execution_count": 32,
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -869,6 +869,15 @@
"graph_config"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"top_nongreen_skills_df.to_csv(f\"{graph_dir}/top_nongreen_skills_by_degree_centrality.csv\")"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down

0 comments on commit d785c8a

Please sign in to comment.