42. Logistic Regression Analysis (Examples)
Overview
Among the regression analysis methods, this post works through examples of logistic regression.
Logistic Regression Example 1
library(ROCR)
data <- read.csv('weather.csv', header = T)
str(data)
colnames(data)
unique(data$RainTomorrow)
# Logistic regression works with probabilities, so convert the target to numeric
data$RainTomorrow <- as.character(data$RainTomorrow)
# recode: rain (1), no rain (0)
data$RainTomorrow[data$RainTomorrow == 'Yes'] <- 1
data$RainTomorrow[data$RainTomorrow == 'No'] <- 0
data$RainTomorrow <- as.numeric(data$RainTomorrow)
data$RainTomorrow
str(data$RainTomorrow)
dim(data)
table(data$RainTomorrow)
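# Supplementary note: an equivalent one-line recode, shown commented out so it
# is not run a second time on the already-numeric column.
# data$RainTomorrow <- ifelse(data$RainTomorrow == 'Yes', 1, 0)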
# Split the data into training and testing sets
train_row <- round(0.7 * nrow(data))
train_row
training <- data[1:train_row,]
testing <- data[(train_row + 1):nrow(data), ]
dim(training)
dim(testing)
colnames(data)
# family = 'binomial': option for binary classification
model <- glm(RainTomorrow ~ ., data = training, family = 'binomial')
summary(model)
model
# sigmoid function: maps any real number into the range 0 to 1
# the result is then thresholded at 0.5 into 0 or 1 for logistic classification (see the check below)
pred <- predict(model, newdata = testing, type = 'response')
head(pred)
head(testing$RainTomorrow)
# > head(pred)
# 257 258 259 260 261 262
# 0.017666666 0.093528864 0.240739871 0.100249740 0.153547394 0.008485457
# > head(testing$RainTomorrow)
# [1] 0 0 0 0 1 0
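# Supplementary check of the sigmoid comment above: type = 'link' returns the
# linear predictor, and applying the sigmoid 1 / (1 + exp(-x)) to it should
# reproduce the probabilities returned by type = 'response'.
lin_pred <- predict(model, newdata = testing, type = 'link')
head(1 / (1 + exp(-lin_pred)))   # should match head(pred)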
# Convert predictions to binary values
res_pred <- ifelse(pred >= 0.5, 1, 0)
res_pred
mTable <- table(res_pred, testing$RainTomorrow)
mTable
# res_pred 0 1
# 0 90 9
# 1 1 7
accuracy <- (90 + 7) / sum(mTable)
accuracy
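# The same accuracy without hard-coding the cell counts: the diagonal of the
# confusion matrix over its total.
sum(diag(mTable)) / sum(mTable)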
# predictions: the predicted probabilities from the model
# labels: the actual answers
pr <- prediction(predictions = pred, labels = testing$RainTomorrow)
prf <- performance(pr, measure = 'tpr', x.measure = 'fpr')
plot(prf, main = 'ROC Curve')
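# Supplementary: the area under the ROC curve (AUC) from the same prediction object.
auc <- performance(pr, measure = 'auc')
auc@y.values[[1]]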
Logistic Regression Example 2
library(ROCR)
data <- read.csv('pima-indians-diabetes.csv', header = F)
str(data)
summary(data)
ncol(data)
# V9 is the dependent variable
dim(data)
# set.seed(1234)
idx <- sample(1:nrow(data), 0.7 * nrow(data))
train <- data[idx,]
test <- data[-idx, ]
dim(train)
dim(test)
# > dim(train)
# [1] 537 9
# > dim(test)
# [1] 231 9
model <- glm(V9 ~ ., data = train, family = 'binomial')
pred <- predict(model, newdata = test, type = 'response')
pred_res <- ifelse(pred >= 0.5, 1, 0)
head(pred_res)
head(test$V9)
# > head(pred_res)
# 9 10 12 13 14 17
# 1 0 1 1 1 0
# > head(test$V9)
# [1] 1 1 1 0 1 1
mTable <- table(pred_res, test$V9)
a <- mTable[1, 1]
b <- mTable[2, 2]
acc <- (a + b) / sum(mTable)
accPer <- paste(round(100 * acc, 2), '%', sep = '')
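# Supplementary sketch: sensitivity (TPR) and false positive rate (FPR) at the
# 0.5 cut-off, computed from the confusion matrix; the ROC curve below sweeps
# these two quantities over all thresholds. Like the mTable indexing above,
# this assumes both classes appear in pred_res.
tpr <- mTable[2, 2] / sum(mTable[, 2])   # predicted 1 among actual 1
fpr <- mTable[2, 1] / sum(mTable[, 1])   # predicted 1 among actual 0
tpr
fpr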
# ROC curve evaluation (a K-fold cross-validation sketch follows at the end of this example)
pr <- prediction(pred, test$V9)
prf <- performance(pr, measure = 'tpr', x.measure = 'fpr')
mainText <- paste('ROC Curve', accPer)
plot(prf, main = mainText)
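# Supplementary sketch of the K-fold cross-validation mentioned above.
# The fold count (k = 5) and the 0.5 threshold are assumptions, not part of
# the original listing.
k <- 5
folds <- sample(rep(1:k, length.out = nrow(data)))
cv_acc <- sapply(1:k, function(i) {
  cv_train <- data[folds != i, ]
  cv_test  <- data[folds == i, ]
  cv_model <- glm(V9 ~ ., data = cv_train, family = 'binomial')
  cv_prob  <- predict(cv_model, newdata = cv_test, type = 'response')
  mean(ifelse(cv_prob >= 0.5, 1, 0) == cv_test$V9)
})
cv_acc         # accuracy per fold
mean(cv_acc)   # average accuracy across folds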
Logistic Regression Example 3
library(ROCR)
data <- read.csv('wine.csv', header = F)
str(data)
summary(data)
idx <- sample(1:nrow(data), 0.7 * nrow(data))
training <- data[idx, ]
testing <- data[-idx, ]
# Logistic regression model
model <- glm(V13 ~ ., data = training, family = 'binomial')
coef(model)
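# Supplementary: logistic regression coefficients are on the log-odds scale;
# exponentiating them gives odds ratios.
exp(coef(model))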
# Generate predictions from the logistic regression model
# Validate the testing data using the model
pred <- predict(model, newdata = testing, type = 'response')
head(pred)
# recode
pred_res <- ifelse(pred >= 0.5, 1, 0)
head(pred_res)
table(pred_res)
# pred_res
# 0 1
# 1476 474
conf <- table(pred_res, testing$V13)
conf
# pred_res 0 1
# 0 1468 8
# 1 6 468
acc <- (conf[1, 1] + conf[2, 2]) / sum(conf)
acc
# [1] 0.9928205
accPer <- paste(round(100 * acc, 2), '%', sep = '')
# Model evaluation using the ROC curve
pr <- prediction(pred, testing$V13)
prf <- performance(pr, measure = 'tpr', x.measure = 'fpr')
mainText <- paste('ROC Curve', accPer)
plot(prf, main = mainText)
Linear Regression Example 4
data <- read.table('housing.csv', header = F)
head(data)
str(data)
summary(data)
colnames(data) <- c('CRIM', 'ZN', 'INDUS',
                    'CHAS', 'NOX', 'RM',
                    'AGE', 'DIS', 'RAD',
                    'TAX', 'PTRATIO', 'B',
                    'LSTAT', 'PRICE')
head(data)
# Split into training and testing data
testing_row <- 10
training_row <- nrow(data) - testing_row
training <- data[1:training_row, ]
testing <- data[(training_row+1):nrow(data), ]
nrow(training) + nrow(testing)
# Build the model
model <- lm(formula = PRICE ~ ., data = training)
model
# Extract the linear regression results
# Regression coefficients
coef(model)
# Fitted values
fitted(model)[1:7]
# Residuals
residuals(model)[1:7]
# The fitted values plus the residuals equal the actual observed values
fitted(model)[1:7] + residuals(model)[1:7]
data[c(1:7), c('PRICE') ]
# Prediction with confidence intervals
pred <- predict(model, newdata = testing, interval="confidence")
pred
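# Supplementary: interval = "prediction" gives prediction intervals for
# individual observations, which are wider than the confidence intervals above.
predict(model, newdata = testing, interval = "prediction")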
# Model evaluation
summary(model)
# Call:
# lm(formula = PRICE ~ ., data = training)
#
# Residuals:
# Min 1Q Median 3Q Max
# -15.700 -2.665 -0.525 1.869 25.962
#
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 35.405559 5.145942 6.880 1.86e-11 ***
# CRIM -0.105122 0.032942 -3.191 0.001510 **
# ZN 0.048787 0.013782 3.540 0.000439 ***
# INDUS 0.021118 0.062052 0.340 0.733765
# CHAS 2.609734 0.863178 3.023 0.002633 **
# NOX -17.035131 3.844504 -4.431 1.16e-05 ***
# RM 3.801268 0.419774 9.056 < 2e-16 ***
# AGE 0.002384 0.013268 0.180 0.857460
# DIS -1.509027 0.201412 -7.492 3.26e-13 ***
# RAD 0.292501 0.067016 4.365 1.56e-05 ***
# TAX -0.012648 0.003778 -3.347 0.000880 ***
# PTRATIO -0.886459 0.134816 -6.575 1.26e-10 ***
# B 0.009244 0.002691 3.436 0.000642 ***
# LSTAT -0.542078 0.051189 -10.590 < 2e-16 ***
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
# Residual standard error: 4.75 on 482 degrees of freedom
# Multiple R-squared: 0.7442, Adjusted R-squared: 0.7373
# F-statistic: 107.8 on 13 and 482 DF, p-value: < 2.2e-16
# Predicted values and actual values
testing$PRICE
df <- data.frame(prediction=pred, real_Price=testing$PRICE)
df
# prediction.fit prediction.lwr prediction.upr real_Price
# 497 14.05287 12.91404 15.19170 19.7
# 498 19.25956 18.36332 20.15580 18.3
# 499 21.47470 20.48618 22.46323 21.2
# 500 18.61183 17.58406 19.63960 17.5
# 501 20.64221 19.74371 21.54071 16.8
# 502 23.98176 22.60630 25.35723 22.4
# 503 22.85730 21.48604 24.22856 20.6
# 504 28.18949 26.60877 29.77020 23.9
# 505 26.66713 25.16609 28.16817 22.0
# 506 22.84702 21.43355 24.26049 11.9
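# Supplementary sketch: simple error metrics for the 10 held-out rows,
# comparing the point predictions (the 'fit' column) with the actual prices.
rmse <- sqrt(mean((pred[, 'fit'] - testing$PRICE)^2))
mae  <- mean(abs(pred[, 'fit'] - testing$PRICE))
rmse
mae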