Skip to content
This repository has been archived by the owner on Aug 19, 2019. It is now read-only.

Intento por subir mis tareas: German y Algas #31

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions EduardoHidalgo/algas/00-load.R
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Read the algae data set through load(), the zero-argument helper defined in
# utils.R (it shadows base::load once utils.R has been sourced).
algas_data <- load()
15 changes: 15 additions & 0 deletions EduardoHidalgo/algas/01-prepare.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Apply the raw column names from metadata.R to the loaded data.
# (load() in utils.R already passes algas_colnames to read_csv when it
# downloads the file.)
colnames(algas_data) <- algas_colnames

# NOTE(review): everything below is commented-out code that refers to
# german_data, not algas_data -- it looks like a leftover from the German
# credit pipeline; confirm whether it can be removed.
#german_data$good_loan <- as.factor(
# ifelse(
# german_data$good_loan == 1,
# 'GoodLoan',
# 'BadLoan'
# )
#)

#german_data <- german_data %>%
# mutate_all(funs(german_decode))

#german_data <- german_data %>%
# mutate_at(c(1,3,4,6,7,9,10,12,14,15,17,19,20),funs(as.factor))
13 changes: 13 additions & 0 deletions EduardoHidalgo/algas/02-clean.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Normalise the column names (lower case; "/" and " " replaced by "_").
colnames(algas_data) <- algas_clean_colnames(algas_colnames)

# Row numbers that readr reported as problematic for algas_data.
problematic_rows <- problems(algas_data)$row

# Rebuild the flagged rows: glue every column after the sixth into a single
# "/"-separated string, split its head into NO3 / NH4 / rest with the regex,
# then split the rest back into columns 9..18.
# NOTE(review): presumably NO3 and NH4 arrive fused in one field in these
# rows -- confirm against the raw file. The regex is intentionally left
# untouched (its unescaped "." matches any character).
algas_data[problematic_rows, ] <- algas_data %>%
  slice(problematic_rows) %>%
  unite(col = "all", -(1:6), sep = "/", remove = TRUE) %>%
  extract(
    all,
    into = c("NO3", "NH4", "resto"),
    regex = "([0-9]*.[0-9]{5})([0-9]*.[0-9]*)/(.*)/NA",
    remove = TRUE
  ) %>%
  separate(resto, into = names(algas_data)[9:18], sep = "/", remove = TRUE)

# Clean the text values of columns 2 and 3. The function is passed directly:
# funs() is deprecated in dplyr and is not needed for a single function.
algas_data <- algas_data %>% mutate_at(c(2, 3), algas_clean_data)

# Re-guess column types now that the repaired rows hold clean values.
algas_data <- readr::type_convert(algas_data)
79 changes: 79 additions & 0 deletions EduardoHidalgo/algas/algas.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# NOTE(review): hard-coded, machine-specific working directory. The source()
# calls below (and the algas.rds cache used by load()) are resolved relative
# to it, so the script only runs as-is on a machine with this exact path.
setwd("~/GitHub/MineriaYAnalisisDeDatos/algas")

library(readr)
library(stringr)
library(tidyr)
library(dplyr)
library(ggplot2)
library(ggthemes)

# Pipeline stages, in order: metadata and helpers first, then load, prepare
# and clean, each of which reads/modifies the global algas_data.
source("metadata.R")
source("utils.R")
source("00-load.R")
source("01-prepare.R")
source("02-clean.R")

# Overview of the cleaned data and of the parsing problems readr recorded.
summary(algas_data)

glimpse(algas_data)

problems(algas_data)


# Missing-data patterns (mice) and their visualisations (VIM).
library(mice)
md.pattern(algas_data)

library("VIM")
aggr(algas_data, prop = FALSE, numbers = TRUE)

matrixplot(algas_data)


# Indicator data frame: 1 where algas_data is NA, 0 otherwise.
x <- as.data.frame(abs(is.na(algas_data)))

head(algas_data)

head(x)

# Keep only the indicator columns that vary, i.e. variables with some (but
# not all) cells missing. vapply() pins the result type to one number per
# column, unlike sapply().
y <- x[which(vapply(x, sd, numeric(1)) > 0)]

# Correlation between missingness indicators: a high positive value means
# the two variables tend to be missing together.
cor(y, y)

# Summary of every column whose name does not start with "a" + digit.
# A logical mask is used instead of x[-grep(...)], which would select zero
# columns if nothing matched.
summary(algas_data[!grepl("^a[1-9]", colnames(algas_data))])

# Rows that contain at least one NA.
algas_con_NAs <- algas_data[!complete.cases(algas_data), ]

algas_con_NAs[c('max_ph', 'min_o2', 'cl', 'no3', 'nh4', 'opo4', 'po4', 'chla')] %>%
  print(n = 33)


# Symbolic correlation matrix of everything but the first three columns,
# computed on complete rows only.
algas_data %>%
  select(-c(1:3)) %>%
  cor(use = "complete.obs") %>%
  symnum()

# Scatter plot of po4 against opo4 with a fitted straight line.
ggplot(algas_data, aes(x = opo4, y = po4)) +
  # shape 1 draws each observation as an open circle
  geom_point(shape = 1) +
  # show the linear regression line, without the confidence band
  geom_smooth(method = lm, se = FALSE) +
  theme_hc()


###


# Density-scaled histogram of max_ph with a kernel density overlay, one panel
# per season.
c2 <- ggplot(algas_data, aes(max_ph)) +
  geom_histogram(aes(y = ..density..), binwidth = 1) +
  geom_density() +
  xlab("PH Maximo") +
  ylab("") +
  ggtitle("Distribucion Empirica del PH Maximo Por Estacion") +
  facet_wrap(~ season, nrow = 3)

# Reference histogram of fathers' heights. father.son is never created or
# attached in this script; it is the data set shipped with the UsingR package,
# so it is referenced through its namespace to avoid "object not found".
# NOTE(review): this plot is unrelated to the algae data -- confirm it is
# still wanted here.
p2 <- ggplot(UsingR::father.son, aes(fheight)) +
  geom_histogram(aes(y = ..density..), binwidth = 1) +
  geom_density() +
  xlim(58, 80) +
  ylim(0, 0.16) +
  xlab("ht (inches)") +
  ylab("") +
  ggtitle("Fathers")

# grid.arrange lives in gridExtra, which this script never attaches; qualify
# the call so it does not fail with "could not find function".
gridExtra::grid.arrange(c2, p2, nrow = 1)
Binary file added EduardoHidalgo/algas/algas.rds
Binary file not shown.
81 changes: 81 additions & 0 deletions EduardoHidalgo/algas/metadata.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
## German credit ---------------------------------------------------------------

## Column names ----------------------------------------------------------------
# Descriptive names for the 21 columns of the German credit data, in file
# order; the last one ("good_loan") is the outcome column.
# (A stray "." in front of the first header comment was removed: R evaluated
# it as a symbol and source() failed with "object '.' not found".)
german_colnames <- c(
  'Status of existing checking account',
  'Duration in month',
  'Credit history',
  'Purpose',
  'Credit amount',
  'Savings account/bonds',
  'Present employment since',
  'Installment rate in percentage of disposable income',
  'Personal status and sex',
  'Other debtors / guarantors',
  'Present residence since',
  'Property',
  'Age in years',
  'Other installment plans',
  'Housing',
  'Number of existing credits at this bank',
  'Job',
  'Number of people being liable to provide maintenance for',
  'Telephone',
  'foreign worker',
  'good_loan'
)

## Códigos ---------------------------------------------------------------------
# Lookup table from the attribute codes used in the German credit data
# ("A11", "A12", ...) to their human-readable descriptions.
german_codes <- list(
  A11 = "... < 0 DM",
  A12 = "0 <= ... < 200 DM",
  A13 = "... >= 200 DM / salary assignments for at least 1 year",
  A14 = "no checking account",
  A30 = "no credits taken/all credits paid back duly",
  A31 = "all credits at this bank paid back duly",
  A32 = "existing credits paid back duly till now",
  A33 = "delay in paying off in the past",
  A34 = "critical account/other credits existing (not at this bank)",
  A40 = "car (new)",
  A41 = "car (used)",
  A42 = "furniture/equipment",
  A43 = "radio/television",
  A44 = "domestic appliances",
  A45 = "repairs",
  A46 = "education",
  A47 = "(vacation - does not exist?)",
  A48 = "retraining",
  A49 = "business",
  A410 = "others",
  A61 = "... < 100 DM",
  A62 = "100 <= ... < 500 DM",
  A63 = "500 <= ... < 1000 DM",
  A64 = ".. >= 1000 DM",
  A65 = "unknown/ no savings account",
  A71 = "unemployed",
  A72 = "... < 1 year",
  A73 = "1 <= ... < 4 years",
  A74 = "4 <= ... < 7 years",
  A75 = ".. >= 7 years",
  A91 = "male : divorced/separated",
  A92 = "female : divorced/separated/married",
  A93 = "male : single",
  A94 = "male : married/widowed",
  A95 = "female : single",
  A101 = "none",
  A102 = "co-applicant",
  A103 = "guarantor",
  A121 = "real estate",
  A122 = "if not A121 : building society savings agreement/life insurance",
  A123 = "if not A121/A122 : car or other, not in attribute 6",
  A124 = "unknown / no property",
  A141 = "bank",
  A142 = "stores",
  A143 = "none",
  A151 = "rent",
  A152 = "own",
  A153 = "for free",
  A171 = "unemployed/ unskilled - non-resident",
  A172 = "unskilled - resident",
  A173 = "skilled employee / official",
  A174 = "management/ self-employed/highly qualified employee/ officer",
  A191 = "none",
  A192 = "yes, registered under the customers name",
  A201 = "yes",
  A202 = "no"
)


## Algas -----------------------------------------------------------------------

## Nombre de columnas ----------------------------------------------------------
# Column names applied to the algae data set (18 in total): eleven explicit
# names followed by the generated names a1..a7.
algas_colnames <- c(
  'season',
  'river_size',
  'fluid_velocity',
  'max_PH',
  'min_O2',
  'Cl',
  'NO3',
  'NH4',
  'oPO4',
  'PO4',
  'Chla',
  # paste0() over 1:7 replaces paste(..., sep = "") over the redundant seq(1:7)
  paste0('a', 1:7)
)
34 changes: 34 additions & 0 deletions EduardoHidalgo/algas/utils.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@

#' Load the algae data set, downloading and caching it on first use.
#'
#' When `algas.rds` is absent from the working directory, the raw file is read
#' from the UCI URL with `read_csv()` (columns named from the global
#' `algas_colnames`, the string "XXXXXXX" read as NA) and saved to
#' `algas.rds`. Otherwise the cached file is read back with `readRDS()`.
#'
#' Status is reported with `message()`: the previous `print()` showed the
#' trailing "\n" literally, and a cache hit is not a warning condition.
#'
#' NOTE(review): the name masks `base::load()` for everything that sources
#' this file; it is kept because the 00-load.R scripts call `load()`.
#'
#' @return The object read from the URL or from `algas.rds`.
load <- function() {
  cache_file <- "algas.rds"

  if (file.exists(cache_file)) {
    message("algas.rds ya existe")
    return(readRDS(cache_file))
  }

  algas_url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/coil-mld/analysis.data"

  algas_data <- readr::read_csv(
    algas_url,
    col_names = algas_colnames,
    na = "XXXXXXX"
  )
  saveRDS(algas_data, cache_file)
  message("algas.rds se bajó y guardó")

  algas_data
}

#' Translate coded values into their descriptions using `german_codes`.
#'
#' Non-character input is returned untouched. For character input, every
#' element that is a name in the global `german_codes` list is replaced by the
#' corresponding description; elements with no entry (including NA) are kept
#' as they are. The previous implementation dropped such elements, returning a
#' vector shorter than its input.
#'
#' @param columna A vector, typically one data-frame column.
#' @return A vector of the same length as `columna`.
algas_decode <- function(columna) {
  if (!is.character(columna)) {
    return(columna)
  }

  decoded <- unname(columna)
  positions <- match(decoded, names(german_codes))
  found <- !is.na(positions)

  # Guard: assigning a zero-length replacement would raise an error.
  if (any(found)) {
    decoded[found] <- unlist(german_codes[positions[found]], use.names = FALSE)
  }

  decoded
}

#' Normalise column names.
#'
#' Lower-cases `x` and replaces every "/" or space with "_".
#'
#' @param x Character vector of names.
#' @return The cleaned names, as returned by `str_replace_all()`.
algas_clean_colnames <- function(x) {
  lowered <- tolower(x)
  str_replace_all(lowered, "/| ", "_")
}

#' Normalise text values.
#'
#' Lower-cases `x` and removes every underscore.
#'
#' @param x Vector of values.
#' @return The cleaned values, as returned by `str_replace_all()`.
algas_clean_data <- function(x) {
  lowered <- tolower(x)
  str_replace_all(lowered, "_", "")
}
1 change: 1 addition & 0 deletions EduardoHidalgo/german/00-load.R
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Read the German credit data through load().
# NOTE(review): load() is expected to come from this project's utils.R, which
# is not part of the visible sources -- confirm it returns the German data.
german_data <- load()
15 changes: 15 additions & 0 deletions EduardoHidalgo/german/01-prepare.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Apply the descriptive column names from metadata.R.
colnames(german_data) <- german_colnames

# Recode the outcome: 1 becomes "GoodLoan", anything else "BadLoan".
german_data$good_loan <- as.factor(
  ifelse(german_data$good_loan == 1, "GoodLoan", "BadLoan")
)

# Run german_decode over every column. The function is passed directly:
# funs() is deprecated in dplyr and is not needed for a single function.
# NOTE(review): german_decode is not defined in the visible sources --
# presumably it lives in this project's utils.R; confirm.
german_data <- german_data %>%
  mutate_all(german_decode)

# Convert the listed columns (selected by position) to factors.
german_data <- german_data %>%
  mutate_at(c(1, 3, 4, 6, 7, 9, 10, 12, 14, 15, 17, 19, 20), as.factor)
1 change: 1 addition & 0 deletions EduardoHidalgo/german/02-clean.R
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Replace the descriptive column names with their cleaned form.
# NOTE(review): german_clean_colnames is not defined in the visible sources --
# presumably in this project's utils.R; confirm.
colnames(german_data) <- german_clean_colnames(german_colnames)
37 changes: 37 additions & 0 deletions EduardoHidalgo/german/german.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# NOTE(review): hard-coded, machine-specific working directory. The source()
# calls below are resolved relative to it, so the script only runs as-is on a
# machine with this exact path.
setwd("~/GitHub/MineriaYAnalisisDeDatos/german")

library(readr)
library(stringr)
library(dplyr)
library(ggplot2)
library(ggthemes)

# Pipeline stages, in order: metadata and helpers first, then load, prepare
# and clean, each of which reads/modifies the global german_data.
source("metadata.R")
source("utils.R")
source("00-load.R")
source("01-prepare.R")
source("02-clean.R")

# Proportion of each loan outcome within every personal_status_and_sex level.
ggplot(german_data, aes(x = personal_status_and_sex, fill = good_loan)) +
  geom_bar(position = "fill") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

# Proportion of each credit history within every loan outcome.
ggplot(german_data, aes(x = good_loan, fill = credit_history)) +
  geom_bar(position = "fill")

# Proportion of each loan outcome within every credit history.
ggplot(german_data, aes(x = credit_history, fill = good_loan)) +
  geom_bar(position = "fill") +
  theme(axis.text.x = element_text(angle = 70, hjust = 1))

# Number of cases per credit history, drawn as horizontal bars ordered by
# count.
german_data %>%
  group_by(credit_history) %>%
  dplyr::summarise(count = n()) %>%
  arrange(desc(count)) %>%
  ggplot() +
  geom_bar(
    aes(x = reorder(credit_history, count), y = count),
    stat = "identity",
    fill = "gray"
  ) +
  coord_flip() +
  theme_hc() +
  ylab("casos") +
  xlab("Historial de crédito")

summary(german_data)
13 changes: 13 additions & 0 deletions EduardoHidalgo/german/german.Rproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
Version: 1.0

RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: ISO8859-1

RnwWeave: Sweave
LaTeX: pdfLaTeX
Binary file added EduardoHidalgo/german/german.rds
Binary file not shown.
Binary file added EduardoHidalgo/german/german.zip
Binary file not shown.
81 changes: 81 additions & 0 deletions EduardoHidalgo/german/metadata.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
## German credit ---------------------------------------------------------------

## Nombres de columnas ---------------------------------------------------------
# Descriptive names for the 21 columns of the German credit data, in file
# order; the last one ("good_loan") is the outcome column.
german_colnames <- c(
  "Status of existing checking account",
  "Duration in month",
  "Credit history",
  "Purpose",
  "Credit amount",
  "Savings account/bonds",
  "Present employment since",
  "Installment rate in percentage of disposable income",
  "Personal status and sex",
  "Other debtors / guarantors",
  "Present residence since",
  "Property",
  "Age in years",
  "Other installment plans",
  "Housing",
  "Number of existing credits at this bank",
  "Job",
  "Number of people being liable to provide maintenance for",
  "Telephone",
  "foreign worker",
  "good_loan"
)

## Códigos ---------------------------------------------------------------------
# Lookup table from the attribute codes used in the German credit data
# ("A11", "A12", ...) to their human-readable descriptions.
german_codes <- list(
  A11 = "... < 0 DM",
  A12 = "0 <= ... < 200 DM",
  A13 = "... >= 200 DM / salary assignments for at least 1 year",
  A14 = "no checking account",
  A30 = "no credits taken/all credits paid back duly",
  A31 = "all credits at this bank paid back duly",
  A32 = "existing credits paid back duly till now",
  A33 = "delay in paying off in the past",
  A34 = "critical account/other credits existing (not at this bank)",
  A40 = "car (new)",
  A41 = "car (used)",
  A42 = "furniture/equipment",
  A43 = "radio/television",
  A44 = "domestic appliances",
  A45 = "repairs",
  A46 = "education",
  A47 = "(vacation - does not exist?)",
  A48 = "retraining",
  A49 = "business",
  A410 = "others",
  A61 = "... < 100 DM",
  A62 = "100 <= ... < 500 DM",
  A63 = "500 <= ... < 1000 DM",
  A64 = ".. >= 1000 DM",
  A65 = "unknown/ no savings account",
  A71 = "unemployed",
  A72 = "... < 1 year",
  A73 = "1 <= ... < 4 years",
  A74 = "4 <= ... < 7 years",
  A75 = ".. >= 7 years",
  A91 = "male : divorced/separated",
  A92 = "female : divorced/separated/married",
  A93 = "male : single",
  A94 = "male : married/widowed",
  A95 = "female : single",
  A101 = "none",
  A102 = "co-applicant",
  A103 = "guarantor",
  A121 = "real estate",
  A122 = "if not A121 : building society savings agreement/life insurance",
  A123 = "if not A121/A122 : car or other, not in attribute 6",
  A124 = "unknown / no property",
  A141 = "bank",
  A142 = "stores",
  A143 = "none",
  A151 = "rent",
  A152 = "own",
  A153 = "for free",
  A171 = "unemployed/ unskilled - non-resident",
  A172 = "unskilled - resident",
  A173 = "skilled employee / official",
  A174 = "management/ self-employed/highly qualified employee/ officer",
  A191 = "none",
  A192 = "yes, registered under the customers name",
  A201 = "yes",
  A202 = "no"
)


## Algas -----------------------------------------------------------------------

## Nombre de columnas ----------------------------------------------------------
# Column names applied to the algae data set (18 in total): eleven explicit
# names followed by the generated names a1..a7.
algas_colnames <- c(
  'season',
  'river_size',
  'fluid_velocity',
  'max_PH',
  'min_O2',
  'Cl',
  'NO3',
  'NH4',
  'oPO4',
  'PO4',
  'Chla',
  # paste0() over 1:7 replaces paste(..., sep = "") over the redundant seq(1:7)
  paste0('a', 1:7)
)
Loading