Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WA Preprocessor Bugfix #160

Merged
merged 1 commit into from
Dec 20, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 14 additions & 24 deletions reggie/ingestion/preprocessor/washington_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,17 +23,17 @@
format_column_name,
)


class PreprocessWashington(Preprocessor):
def __init__(self, raw_s3_file, config_file, force_date=None, **kwargs):

if force_date is None:
force_date = date_from_str(raw_s3_file)

super().__init__(
raw_s3_file=raw_s3_file,
config_file=config_file,
force_date=force_date,
**kwargs
**kwargs,
)
self.raw_s3_file = raw_s3_file
self.processed_file = None
Expand Down Expand Up @@ -81,7 +81,7 @@ def execute(self):
# There's a weird corruption of the county column name
# in the 2021-2022 history file, so fix that:
for c in temp.columns:
if "CountyCode" in c:
if "CountyCode" in c and "voting" not in c.lower():
temp.rename(columns={c: "CountyCode"}, inplace=True)
df_hist = pd.concat([df_hist, temp], ignore_index=True)

Expand All @@ -90,10 +90,8 @@ def execute(self):
# Need to fix/combine the differently named VoterHistoryID
# and VotingHistoryID columns
if {"VotingHistoryID", "VoterHistoryID"}.issubset(df_hist.columns):
df_hist["VotingHistoryID"] = (
df_hist.pop("VoterHistoryID").fillna(
df_hist.pop("VotingHistoryID")
)
df_hist["VotingHistoryID"] = df_hist.pop("VoterHistoryID").fillna(
df_hist.pop("VotingHistoryID")
)

# can't find voter history documentation in any yaml, hardcoding column name
Expand Down Expand Up @@ -131,9 +129,7 @@ def convert_date(k):
all_history = voter_groups["all_history"].apply(list)
sparse_history = voter_groups["sparse_history"].apply(list)
county_history = voter_groups["county_history"].apply(list)
df_hist = pd.concat(
[all_history, sparse_history, county_history], axis=1
)
df_hist = pd.concat([all_history, sparse_history, county_history], axis=1)

# --- handling the voter file --- #
# Aug 2023 - some columns have changed names slightly
Expand All @@ -142,21 +138,16 @@ def convert_date(k):
inplace=True,
)
# some columns have become obsolete
df_voter = df_voter.loc[
:, df_voter.columns.isin(self.config["column_names"])
]
df_voter = df_voter.loc[:, df_voter.columns.isin(self.config["column_names"])]
df_voter = df_voter.set_index(self.config["voter_id"])

# pandas loads any numeric column with NaN values as floats
# causing formatting trouble during execute() with a few columns
# saw this solution in other states (arizona & texas)
to_numeric = [
df_voter.loc[:, col].str.isnumeric().all()
for col in df_voter.columns
df_voter.loc[:, col].str.isnumeric().all() for col in df_voter.columns
]
df_voter.loc[:, to_numeric] = (
df_voter.loc[:, to_numeric].fillna(-1).astype(int)
)
df_voter.loc[:, to_numeric] = df_voter.loc[:, to_numeric].fillna(-1).astype(int)

df_voter = self.config.coerce_numeric(df_voter)
df_voter = self.config.coerce_strings(
Expand All @@ -171,7 +162,7 @@ def convert_date(k):
# add voter history
df_voter = df_voter.join(df_hist)

# Add party_identifier dummy values,
# Add party_identifier dummy values,
# since WA doesn't have party info
df_voter.loc[:, self.config["party_identifier"]] = NO_PARTY_PLACEHOLDER

Expand All @@ -181,9 +172,9 @@ def convert_date(k):
self.config["status_codes_remap"]
)
if df_voter["StatusCode"].isnull().any():
missing = df_voter[
df_voter["StatusCode"].isnull()
]["StatusCodeOrig"].to_list()
missing = df_voter[df_voter["StatusCode"].isnull()][
"StatusCodeOrig"
].to_list()
logging.warning("Status codes missing from status_codes_remap")
logging.warning(missing)

Expand All @@ -196,8 +187,7 @@ def convert_date(k):

# Make sure all columns are present
expected_cols = (
self.config["ordered_columns"]
+ self.config["ordered_generated_columns"]
self.config["ordered_columns"] + self.config["ordered_generated_columns"]
)
# Remove the index column to avoid duplication
expected_cols.remove(self.config["voter_id"])
Expand Down
Loading