Skip to content

Commit

Permalink
fix double quote issue in gje formatting
Browse files Browse the repository at this point in the history
  • Loading branch information
lizgzil committed Apr 8, 2024
1 parent fa6cb91 commit 75ea4f6
Showing 1 changed file with 10 additions and 2 deletions.
12 changes: 10 additions & 2 deletions dap_prinz_green_jobs/analysis/ojo_analysis/gje_formatting.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import os
import ast
import re

import pandas as pd

Expand Down Expand Up @@ -55,6 +56,13 @@ def decap_inds(top_5_sics):
# Convert all the single quotes to double quotes (needed for the GJE).
# Because pandas saves dicts with single quotes, we read the dict columns
# in as strings and then convert the quotes like this.
def clean_quotes(phrase):
    """Return ``phrase`` as a string using double quotes instead of single quotes.

    The value is first converted with ``str()``. An apostrophe that directly
    follows an "s" and is itself followed by a space and an ASCII letter is
    removed; every single quote that remains is then replaced with a
    double quote.
    """
    text = str(phrase)

    # "other builders\' carpentry" (this messes the data up a bit, so just remove)
    without_trailing_apostrophes = re.sub(r"s\' ([a-zA-Z])", r"s \1", text)

    # Replace all other single quotes with double
    return without_trailing_apostrophes.replace("'", '"')

for col_name in [
"top_5_socs",
Expand All @@ -64,8 +72,8 @@ def decap_inds(top_5_sics):
"top_5_itl2_quotient",
"top_5_similar_occs",
]:
occ_agg_extra_loaded[col_name] = occ_agg_extra_loaded[col_name].str.replace(
"'", '"'
occ_agg_extra_loaded[col_name] = occ_agg_extra_loaded[col_name].apply(
clean_quotes
)

# Remove betting shop managers as they have a quirk which means many of them
Expand Down

0 comments on commit 75ea4f6

Please sign in to comment.