游客满意度调研数据(模拟数据,42个变量,373条记录)

这是一份用随机数模拟生成的数据,可用于练习,但分析结论仅供演示,同学们不必太当真。

(1)电子问卷

http://kaiwu.city/openfiles/tourist_satisfaction_questionnaire_cn.docx

(2)csv格式调研数据

http://kaiwu.city/openfiles/tourist.csv

(3)Excel格式调研数据

http://kaiwu.city/openfiles/tourist_cn.xlsx

(4)SPSS格式调研数据 http://kaiwu.city/openfiles/data_tourist_cn.sav

1.data preparation数据预处理

data preparation(数据准备)与data wrangling、data munging、data cleaning的含义非常接近 https://theappsolutions.com/blog/development/data-wrangling-guide-to-data-preparation/

Data Wrangling in R 常用拓展包:
(1) dplyr - essential data-munging R package; a supreme data-framing tool, especially useful for managing data by categories.
(2) purrr - good for list-function operations and error-checking.
(3) splitstackshape - an oldie but goodie; good for reshaping complex data sets and simplifying visualization.
(4) JSOnline - a nice and easy parsing tool.
(5) magrittr - good for pulling scattered wrangling steps into a more coherent form.
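
下面是一个示意性的dplyr小例子(仅作演示,假设已联网并安装了dplyr;数据用的是本文的tourist.csv直链),展示按类别分组汇总的基本写法:

# 示意:用dplyr按性别分组,汇总人数与平均旅游支出
if(!isTRUE(require("dplyr"))){install.packages("dplyr")}
library(dplyr)

tourist_demo <- read.csv('http://kaiwu.city/openfiles/tourist.csv',header=TRUE,encoding = 'UTF-8')

tourist_demo %>%
  group_by(gender) %>%
  summarise(n = n(), mean_expense = mean(expense))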

1.1 load packages载入拓展功能包

# 最为稳健的方法,检测是否安装,没有安装就安装

if(!isTRUE(require("Hmisc"))){install.packages("Hmisc")}
## Loading required package: Hmisc
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
#数据分析包
if(!isTRUE(require("psych"))){install.packages("psych")}
## Loading required package: psych
## 
## Attaching package: 'psych'
## The following object is masked from 'package:Hmisc':
## 
##     describe
#数据分析包
if(!isTRUE(require("lavaan"))){install.packages("lavaan")} #结构方程分析
## Loading required package: lavaan
## This is lavaan 0.6-15
## lavaan is FREE software! Please report any bugs.
## 
## Attaching package: 'lavaan'
## The following object is masked from 'package:psych':
## 
##     cor2cov
if(!isTRUE(require("broom"))){install.packages("broom")}# 分析结果整洁输出
## Loading required package: broom
if(!isTRUE(require("ggplot2"))){install.packages("ggplot2")}# 专业绘图,数据可视化
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
if(!isTRUE(require("ggrepel"))){install.packages("ggrepel")} # 绘图的标注(不遮挡)
## Loading required package: ggrepel
if(!isTRUE(require("corrplot"))){install.packages("corrplot")} #相关分析绘图
## Loading required package: corrplot
## corrplot 0.92 loaded
if(!isTRUE(require("semPlot"))){install.packages("semPlot")} #结构方程分析绘图
## Loading required package: semPlot
if(!isTRUE(require("leaflet"))){install.packages("leaflet")}# 地图绘制,基于openstreet数据
## Loading required package: leaflet
#library(Hmisc) #数据分析包
#library(psych) #数据分析包
#library(broom) # 分析结果整洁输出
#library(leaflet) # 地图绘制,基于openstreet数据
#library(ggplot2) #专业绘图
#library(ggrepel) # 绘图的标注(不遮挡)
#library(lavaan) #结构方程分析
#library(semPlot) #结构方程分析绘图
#library(corrplot)# 相关系数图

1.2 import dataset导入数据

1.2a设定工作文件夹

#设定工作文件夹, 根据自己电脑的情况进行修改
datafolder<-'D:/tdata/tourist_cn_R/'



#文件夹的设定,注意斜线的方向
#datafolder<-'D:\\tdata\\tourist_cn_R\\'


#数据可以通过网盘直链下载
# http://kaiwu.city/openfiles/tourist.csv 
#373条记录(被调查者),42个变量
#数据下载后存入datafolder

1.2b导入csv格式数据

数据可以通过网盘直链下载:http://kaiwu.city/openfiles/tourist.csv ,共373条记录(被调查者)、42个变量。数据下载后存入datafolder。

read.csv是常见的读入数据方法(csv格式,以逗号为分隔符的数据集)

tourist<-read.csv('http://kaiwu.city/openfiles/tourist.csv',header=TRUE,encoding = 'UTF-8')



#tourist<-read.csv(paste(datafolder,'tourist.csv',sep=""),header=TRUE,encoding = 'UTF-8')

#tourist<-read.csv('D:/tdata/tourist_cn_R/tourist.csv',header=TRUE,encoding = 'UTF-8')


#3种相同效果的文件名及路径
#第1种:'D:/pdata/rdata/tourist_cn/tourist.csv'
#第2种:paste(datafolder,'tourist.csv',sep="")
#第3种:paste0(datafolder,'tourist.csv')
#注意第3种paste0,最后1位是数字0,不是字母O。


#header表示是否有表头——数据第1行包含变量名时,header=TRUE,否则header=FALSE
#encoding = 'UTF-8'是csv文件的编码格式,UTF-8支持中文等多种字符;注意Excel另存为csv时默认是ANSI编码,需要设置一下。

#head 表示呈现前6行数据,tail显示后6行数据
head(tourist)
##      sid gender byear region income expense type3 type2 thotel sat1 sat2 sat3
## 1 rec001      1  1971      1   2708     432     3     2      3    4    2    2
## 2 rec002      1  1995      1   1884     238     2     1      3    5    5    4
## 3 rec003      1  1990      2   2458     399     3     2      3    3    4    5
## 4 rec004      2  1970      2   2726     245     2     1      1    5    2    3
## 5 rec005      1  1964      4   3084     287     2     2      2    5    3    5
## 6 rec006      1  1965      5   2184     216     3     1      3    4    3    3
##   sat4 sat5 sat6 ri1 ri2 ri3 ri4 ri5 rp1 rp2 rp3 rp4 rp5 te1 te2 te3 te4 te5
## 1    3    4    4   4   4   4   5   4   4   5   4   5   4   4   4   4   4   4
## 2    5    3    5   4   3   4   4   5   4   3   4   4   5   5   5   4   4   4
## 3    3    3    2   5   5   5   5   5   5   5   4   5   5   3   4   4   4   4
## 4    5    4    4   4   5   5   5   5   4   5   5   5   5   4   4   4   4   5
## 5    2    4    5   5   5   5   5   5   5   5   5   5   5   4   5   4   4   5
## 6    2    5    3   4   4   5   5   5   4   4   5   5   5   1   3   3   3   1
##   te6 te7 te8 zh1 zh2 zh3 zh4 zh5 zh6 zh7 latitude longitude
## 1   4   4   4   3   1   1   2   3   2   2 31.24039  121.6653
## 2   5   3   4   4   4   5   3   5   5   5 31.20866  121.4604
## 3   4   4   4   1   1   2   2   1   3   1 31.10409  121.3958
## 4   4   2   4   3   3   3   3   2   2   2 31.06726  121.1696
## 5   5   4   5   1   1   1   4   2   1   1 31.17709  121.6795
## 6   1   3   2   4   3   3   4   2   4   4 31.18326  121.4021
#tail(tourist)

1.2c 练习1 导入csv格式数据

# 练习1 : 下载数据,试着用本地磁盘文件,修改datafolder目录,读入数据。
#tourist<-read.csv('http://kaiwu.city/openfiles/tourist.csv',header=TRUE,encoding = 'UTF-8')

#tourist<-read.csv(paste(datafolder,'tourist.csv',sep=""),header=TRUE,encoding = 'UTF-8')

1.2d导入SPSS格式数据

SPSS格式数据下载网址:http://kaiwu.city/openfiles/data_tourist_cn.sav 。注意需要安装haven拓展功能包,才可以导入。

#导入SPSS格式数据也可以,需要haven
#if(!isTRUE(require("haven"))){install.packages("haven")}
#tourist_spss<-read_sav(url("http://kaiwu.city/openfiles/data_tourist_cn.sav"))

#tourist_spss<-read_sav(paste0(datafolder,'data_tourist_satisfaction_cnlabels.sav'))
#head(tourist_spss)


#head(tourist_spss)

1.2e导入excel格式数据

如果想直接导入Excel格式数据(xls或xlsx),推荐使用XLConnect。XLConnect可以实现Excel与R的数据双向交互。

连接汇报用的Excel文件,可参考博文:https://www.r-bloggers.com/2016/03/few-steps-to-connect-r-with-excel-xlconnect-2/ 与 http://www.sthda.com/english/articles/2-r/4-xlconnect-read-write-and-manipulate-microsoft-excel-files-from-within-r/
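
下面是一个XLConnect读取Excel的示意写法(仅作草图,假设已安装XLConnect并把tourist_cn.xlsx下载到datafolder,工作表名假设为data,与下文openxlsx示例一致;XLConnect依赖Java环境):

#示意:用XLConnect读取Excel
#if(!isTRUE(require("XLConnect"))){install.packages("XLConnect")}

#wb<-loadWorkbook(paste0(datafolder,"tourist_cn.xlsx"))
#tourist_xlc<-readWorksheet(wb,sheet="data")
#head(tourist_xlc)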

#导入Excel格式数据也可以,需要readxl

#if(!isTRUE(require("readxl"))){install.packages("readxl")}

#tourist_excel<-read_excel(paste0(datafolder,"data_tourist_satisfaction_en.xlsx"),sheet="data")

#导入Excel格式数据也可以,需要openxlsx

#if(!isTRUE(require("openxlsx"))){install.packages("openxlsx")}

#tourist_excel<- read.xlsx("http://kaiwu.city/openfiles/tourist_cn.xlsx",sheet="data")


#tourist_excel<- read.xlsx(paste0(datafolder,"data_tourist_satisfaction_en.xlsx"),sheet="data")

1.3 value labels变量值标签

1.3a 变量值标签:示例

#value labels变量值标签,参考电子问卷
#   http://kaiwu.city/openfiles/tourist_satisfaction_questionnaire_cn.docx



#tourist是数据框dataframe的名字,gender是变量的名字,tourist$gender就是调用tourist数据框的gender变量

#等同的效果是tourist[,"gender"]或tourist[,2]

#factor是R的一种数据形式,可以简单理解为分类变量,注意这个levels是可以有顺序的。

tourist$gender <- factor(tourist$gender,levels = c(1,2),labels = c("男","女"))


tourist$thotel <- factor(tourist$thotel,levels = c(1,2,3,4),labels = c("经济型酒店", "豪华型酒店", "民宿", "酒店式公寓"))

tourist$type3 <- factor(tourist$type3,levels = c(1,2,3),labels = c("自然风光", "历史文化", "自然与历史混合"))

tourist$type2 <- factor(tourist$type2,levels = c(1,2),labels = c("观赏型", "参与型"))

head(tourist)
##      sid gender byear region income expense          type3  type2     thotel
## 1 rec001     男  1971      1   2708     432 自然与历史混合 参与型       民宿
## 2 rec002     男  1995      1   1884     238       历史文化 观赏型       民宿
## 3 rec003     男  1990      2   2458     399 自然与历史混合 参与型       民宿
## 4 rec004     女  1970      2   2726     245       历史文化 观赏型 经济型酒店
## 5 rec005     男  1964      4   3084     287       历史文化 参与型 豪华型酒店
## 6 rec006     男  1965      5   2184     216 自然与历史混合 观赏型       民宿
##   sat1 sat2 sat3 sat4 sat5 sat6 ri1 ri2 ri3 ri4 ri5 rp1 rp2 rp3 rp4 rp5 te1 te2
## 1    4    2    2    3    4    4   4   4   4   5   4   4   5   4   5   4   4   4
## 2    5    5    4    5    3    5   4   3   4   4   5   4   3   4   4   5   5   5
## 3    3    4    5    3    3    2   5   5   5   5   5   5   5   4   5   5   3   4
## 4    5    2    3    5    4    4   4   5   5   5   5   4   5   5   5   5   4   4
## 5    5    3    5    2    4    5   5   5   5   5   5   5   5   5   5   5   4   5
## 6    4    3    3    2    5    3   4   4   5   5   5   4   4   5   5   5   1   3
##   te3 te4 te5 te6 te7 te8 zh1 zh2 zh3 zh4 zh5 zh6 zh7 latitude longitude
## 1   4   4   4   4   4   4   3   1   1   2   3   2   2 31.24039  121.6653
## 2   4   4   4   5   3   4   4   4   5   3   5   5   5 31.20866  121.4604
## 3   4   4   4   4   4   4   1   1   2   2   1   3   1 31.10409  121.3958
## 4   4   4   5   4   2   4   3   3   3   3   2   2   2 31.06726  121.1696
## 5   4   4   5   5   4   5   1   1   1   4   2   1   1 31.17709  121.6795
## 6   3   3   1   1   3   2   4   3   3   4   2   4   4 31.18326  121.4021

1.3b 练习2 变量值标签

##### 练习2
##### region(区域)变量的变量值标签。
##### 1=华中 2=华东 3=华北 4=东北 5=西北 6=西南 7=华西
##### 练习2的答案


tourist$region <- factor(tourist$region,levels = c(1,2,3,4,5,6,7),labels = c('华中',  '华东',   '华北',   '东北',   '西北',   '西南',   '华西'))

head(tourist)
##      sid gender byear region income expense          type3  type2     thotel
## 1 rec001     男  1971   华中   2708     432 自然与历史混合 参与型       民宿
## 2 rec002     男  1995   华中   1884     238       历史文化 观赏型       民宿
## 3 rec003     男  1990   华东   2458     399 自然与历史混合 参与型       民宿
## 4 rec004     女  1970   华东   2726     245       历史文化 观赏型 经济型酒店
## 5 rec005     男  1964   东北   3084     287       历史文化 参与型 豪华型酒店
## 6 rec006     男  1965   西北   2184     216 自然与历史混合 观赏型       民宿
##   sat1 sat2 sat3 sat4 sat5 sat6 ri1 ri2 ri3 ri4 ri5 rp1 rp2 rp3 rp4 rp5 te1 te2
## 1    4    2    2    3    4    4   4   4   4   5   4   4   5   4   5   4   4   4
## 2    5    5    4    5    3    5   4   3   4   4   5   4   3   4   4   5   5   5
## 3    3    4    5    3    3    2   5   5   5   5   5   5   5   4   5   5   3   4
## 4    5    2    3    5    4    4   4   5   5   5   5   4   5   5   5   5   4   4
## 5    5    3    5    2    4    5   5   5   5   5   5   5   5   5   5   5   4   5
## 6    4    3    3    2    5    3   4   4   5   5   5   4   4   5   5   5   1   3
##   te3 te4 te5 te6 te7 te8 zh1 zh2 zh3 zh4 zh5 zh6 zh7 latitude longitude
## 1   4   4   4   4   4   4   3   1   1   2   3   2   2 31.24039  121.6653
## 2   4   4   4   5   3   4   4   4   5   3   5   5   5 31.20866  121.4604
## 3   4   4   4   4   4   4   1   1   2   2   1   3   1 31.10409  121.3958
## 4   4   4   5   4   2   4   3   3   3   3   2   2   2 31.06726  121.1696
## 5   4   4   5   5   4   5   1   1   1   4   2   1   1 31.17709  121.6795
## 6   3   3   1   1   3   2   4   3   3   4   2   4   4 31.18326  121.4021

1.4 compute variables 计算变量

1.4a 计算变量:示例

计算变量(compute variables)与重新编码(recode)是两种思路,但有时可以相互替换。

#计算总体满意度sat
tourist<-transform(tourist,sat=(sat1+sat2+sat3+sat4+sat5+sat6)/6)

summary(tourist$sat)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.667   3.667   4.000   3.994   4.333   5.000

1.4b 练习3 计算变量

#compute variables
# 练习3 : 基于byear(出生年份),计算age变量(年龄)。
# 2023 - byear
#练习3答案
#练习3 : 基于byear(出生年份),计算age变量(年龄)。
# 方案1:transform
tourist<-transform(tourist,age=2023-byear)

# 方案2:直接计算
# tourist$age<-2023-tourist$byear

summary(tourist$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   30.00   45.00   44.12   58.00   71.00

1.5 recode 重新编码

1.5a recode:收入段

attach和detach配合使用,可以减少tourist$前缀的重复书写;方括号[]里面是筛选条件。

# recode 重新编码:income3收入段变量——分3段
attach(tourist)
tourist$income3[income  < 2000] <- 1
tourist$income3[income >= 2000 & income  <3000] <- 2
tourist$income3[income >= 3000] <- 3
detach(tourist)

# 加变量值标签
tourist$income3 <- factor(tourist$income3,levels = c(1,2,3),labels = c("2000以下", "2000-3000", "3000以上"))


summary(tourist$income3)
##  2000以下 2000-3000  3000以上 
##        76       191       106

1.5b 练习4 recode

#compute variables
# 练习4 : 把收入变量income分为5段,2000以下,2000-2499,2500-2999,3000-3499,3500以上
# 练习4答案
# 练习4 : 把收入变量income分为5段,2000以下,2000-2499,2500-2999,3000-3499,3500以上

attach(tourist)
tourist$income5[income  < 2000] <- 1
tourist$income5[income >= 2000 & income  <2500] <- 2
tourist$income5[income >= 2500 & income  <3000] <- 3
tourist$income5[income >= 3000 & income  <3500] <- 4
tourist$income5[income >= 3500] <- 5
detach(tourist)

# 加变量值标签
tourist$income5 <- factor(tourist$income5,levels = c(1,2,3,4,5),labels = c("2000以下", "2000-2499","2500-2999","3000-3499", "3500以上"))


summary(tourist$income5)
##  2000以下 2000-2499 2500-2999 3000-3499  3500以上 
##        76        80       111        72        34

1.5c recode:旅游支出分段

# recode 重新编码:旅游花费变量——分3段
attach(tourist)
tourist$expense3[expense  < 300] <- 1
tourist$expense3[expense >= 300 & expense  < 400] <- 2
tourist$expense3[expense >= 400] <- 3
detach(tourist)


# 加变量值标签
tourist$expense3 <- factor(tourist$expense3,levels = c(1,2,3),labels = c("300以下", "300-399", "400以上"))
summary(tourist$expense3)
## 300以下 300-399 400以上 
##     152     144      77

1.6 summary()

快速浏览、汇总数据(最大值、最小值、数据类型、变量值标签),有利于发现异常值。

summary(tourist)
##      sid            gender       byear       region       income    
##  Length:373         男:214   Min.   :1952   华中:62   Min.   :1133  
##  Class :character   女:159   1st Qu.:1965   华东:60   1st Qu.:2121  
##  Mode  :character            Median :1978   华北:62   Median :2654  
##                              Mean   :1979   东北:45   Mean   :2590  
##                              3rd Qu.:1993   西北:48   3rd Qu.:3084  
##                              Max.   :2005   西南:57   Max.   :3773  
##                                             华西:39                 
##     expense                 type3        type2            thotel   
##  Min.   :199.0   自然风光      : 58   观赏型:178   经济型酒店:163  
##  1st Qu.:262.0   历史文化      :132   参与型:195   豪华型酒店: 35  
##  Median :326.0   自然与历史混合:183                民宿      :121  
##  Mean   :330.6                                     酒店式公寓: 54  
##  3rd Qu.:391.0                                                     
##  Max.   :556.0                                                     
##                                                                    
##       sat1            sat2            sat3            sat4      
##  Min.   :3.000   Min.   :2.000   Min.   :1.000   Min.   :2.000  
##  1st Qu.:4.000   1st Qu.:3.000   1st Qu.:3.000   1st Qu.:4.000  
##  Median :5.000   Median :4.000   Median :4.000   Median :4.000  
##  Mean   :4.349   Mean   :3.743   Mean   :3.676   Mean   :4.214  
##  3rd Qu.:5.000   3rd Qu.:5.000   3rd Qu.:5.000   3rd Qu.:5.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000  
##                                                                 
##       sat5            sat6            ri1             ri2       
##  Min.   :3.000   Min.   :2.000   Min.   :3.000   Min.   :3.000  
##  1st Qu.:3.000   1st Qu.:3.000   1st Qu.:4.000   1st Qu.:4.000  
##  Median :4.000   Median :4.000   Median :4.000   Median :5.000  
##  Mean   :4.097   Mean   :3.887   Mean   :4.351   Mean   :4.491  
##  3rd Qu.:5.000   3rd Qu.:5.000   3rd Qu.:5.000   3rd Qu.:5.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000  
##                                                                 
##       ri3             ri4             ri5             rp1       
##  Min.   :3.000   Min.   :3.000   Min.   :3.000   Min.   :3.000  
##  1st Qu.:4.000   1st Qu.:5.000   1st Qu.:5.000   1st Qu.:4.000  
##  Median :5.000   Median :5.000   Median :5.000   Median :4.000  
##  Mean   :4.499   Mean   :4.708   Mean   :4.727   Mean   :4.316  
##  3rd Qu.:5.000   3rd Qu.:5.000   3rd Qu.:5.000   3rd Qu.:5.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000  
##                                                                 
##       rp2             rp3             rp4             rp5       
##  Min.   :3.000   Min.   :2.000   Min.   :3.000   Min.   :3.000  
##  1st Qu.:4.000   1st Qu.:4.000   1st Qu.:5.000   1st Qu.:4.000  
##  Median :5.000   Median :5.000   Median :5.000   Median :5.000  
##  Mean   :4.515   Mean   :4.383   Mean   :4.721   Mean   :4.643  
##  3rd Qu.:5.000   3rd Qu.:5.000   3rd Qu.:5.000   3rd Qu.:5.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000  
##                                                                 
##       te1             te2             te3             te4       
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:4.000   1st Qu.:3.000   1st Qu.:4.000   1st Qu.:4.000  
##  Median :4.000   Median :4.000   Median :4.000   Median :4.000  
##  Mean   :3.912   Mean   :3.791   Mean   :4.131   Mean   :4.078  
##  3rd Qu.:5.000   3rd Qu.:4.000   3rd Qu.:5.000   3rd Qu.:5.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000  
##                                                                 
##       te5             te6             te7             te8       
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:3.000   1st Qu.:3.000   1st Qu.:3.000   1st Qu.:3.000  
##  Median :4.000   Median :4.000   Median :4.000   Median :4.000  
##  Mean   :3.756   Mean   :3.898   Mean   :3.729   Mean   :3.657  
##  3rd Qu.:4.000   3rd Qu.:5.000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000  
##                                                                 
##       zh1             zh2             zh3             zh4       
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:2.000   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.000  
##  Median :3.000   Median :3.000   Median :3.000   Median :2.000  
##  Mean   :2.836   Mean   :2.627   Mean   :2.702   Mean   :2.552  
##  3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000  
##                                                                 
##       zh5             zh6             zh7           latitude    
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :30.79  
##  1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:31.15  
##  Median :3.000   Median :3.000   Median :3.000   Median :31.20  
##  Mean   :2.598   Mean   :2.528   Mean   :2.555   Mean   :31.21  
##  3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:31.23  
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :31.72  
##                                                                 
##    longitude          sat             age             income3   
##  Min.   :121.0   Min.   :2.667   Min.   :18.00   2000以下 : 76  
##  1st Qu.:121.4   1st Qu.:3.667   1st Qu.:30.00   2000-3000:191  
##  Median :121.5   Median :4.000   Median :45.00   3000以上 :106  
##  Mean   :121.5   Mean   :3.994   Mean   :44.12                  
##  3rd Qu.:121.7   3rd Qu.:4.333   3rd Qu.:58.00                  
##  Max.   :121.9   Max.   :5.000   Max.   :71.00                  
##                                                                 
##       income5       expense3  
##  2000以下 : 76   300以下:152  
##  2000-2499: 80   300-399:144  
##  2500-2999:111   400以上: 77  
##  3000-3499: 72                
##  3500以上 : 34                
##                               
## 

1.7 数据的分布情况

数据分布对于统计方法的选择和统计检验非常重要。为了提高运行效率,这里使用R默认的绘图工具,没有用ggplot2。

1.7a 数据分布:示例

#概率分布图

plot(density(tourist$sat1))

#main=""是图标题
#xlab='总体满意度'设置x轴标题
#ylab='频数'设置y轴标题

plot(density(tourist$sat1),main="", xlab='景区满意度', ylab='频数')

hist(tourist$sat1)

hist(tourist$sat1,main="", xlab='景区满意度', ylab='频数')

1.7b 练习5 数据分布

#查看概率分布
# 练习5 : 查看收入变量income的分布。
# 练习5答案
# 练习5 : 查看收入变量income的分布。
hist(tourist$income,main="", xlab='收入', ylab='频数')

1.7c 用循环语句查看多个变量的分布:密度图

程序设计(如循环语句)可以提高批处理的效率,这部分不做要求,感兴趣的同学可以自行研究。

#10是变量所在的列的序号
#for (i in 10:40){
#   plot(density(tourist[,i]))
#   plot(density(tourist[,i]),main="", xlab=names(tourist[i]))
#}

1.7d 用循环语句查看多个变量的分布:柱形图

#for (i in 10:40){
#  hist(tourist[,i],main="", xlab=names(tourist[i]))
#
#  #带频数值标签的柱形图
#  h<-hist(tourist[,i],plot=FALSE)
#  hist(tourist[,i],main="", xlab=names(tourist[i]),labels = TRUE,ylim=c(0,1.1*max(h$counts)))
#
#  #关于柱形图的标注问题,参考如下网址
#  #https://stackoverflow.com/questions/9317948/how-to-label-histogram-bars-with-data-values-or-percents-in-r
#}

1.8删除一个变量

#产生一个新变量
tourist$sat2023<-tourist$sat1/5

删除使用NULL

#删除这个刚生成的变量
tourist$sat2023<-NULL

1.9 save dataset

保存为rda格式数据,以后可以反复调用。注意:从第2部分开始,直接载入rda数据即可,不需要把第1部分的预处理重复一遍。

save(tourist,file=paste0(datafolder,"tourist_cn.rda"))
#save(tourist,file=paste(datafolder,"tourist.rda",sep=""))
#输出为csv文件
#write.csv(tourist,paste(datafolder,'tourist2023cn.csv',sep=""),fileEncoding = "GBK")

2. basic analysis基础分析

#设定工作文件夹, 根据自己电脑的情况进行修改
datafolder<-'D:/tdata/tourist_cn_R/'

load是用于载入保存了的rda格式数据

load(paste0(datafolder,'tourist_cn.rda'))

2.1 frequency analysis频数分析

频数分析。对象命名示例:Freq_gender(性别频数表)、Freq_thotel_gender(住宿类型×性别频数表),也可以写作FreqGender、FreqThotelGender。

#性别的频数表
Freq_gender<-table(tourist$gender)
Freq_gender
## 
##  男  女 
## 214 159
#性别的百分比
prop.table(Freq_gender)
## 
##        男        女 
## 0.5737265 0.4262735
#保留小数点后2位数字
round(prop.table(Freq_gender)*100,2)
## 
##    男    女 
## 57.37 42.63

为了输出更整洁一点,可以转换为数据框data frame

#性别的频数表


#频数表gender_count为数据框data frame
gender_count<-as.data.frame(table(tourist$gender))


#频数表gender_percent为数据框data frame
gender_percent<-as.data.frame(round(prop.table(table(tourist$gender)),4))

#合并gender_count和gender_percent
gender_Freq<-cbind(gender_count,gender_percent$Freq)

#更改变量名
colnames(gender_Freq)<-c('性别','频数','Percent')

print(gender_Freq)
##   性别 频数 Percent
## 1   男  214  0.5737
## 2   女  159  0.4263
#输出为csv文件
#write.csv(gender_Freq,paste(datafolder,'gender_Freq.csv',sep=""),fileEncoding = "GBK")

如果用XLConnect,可以把结果写入特定的Excel文件与工作表。
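
下面是一个用XLConnect把gender_Freq写入指定Excel文件的示意写法(仅作草图,假设已安装XLConnect,文件名report.xlsx是举例):

#示意:用XLConnect把频数表写入特定的Excel文件与工作表
#if(!isTRUE(require("XLConnect"))){install.packages("XLConnect")}

#wb<-loadWorkbook(paste0(datafolder,"report.xlsx"),create=TRUE)
#createSheet(wb,name="gender_Freq")
#writeWorksheet(wb,gender_Freq,sheet="gender_Freq")
#saveWorkbook(wb)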

#住宿类型的频数表


#频数表thotel_count为数据框data frame
thotel_count<-as.data.frame(table(tourist$thotel))


#频数表thotel_percent为数据框data frame
thotel_percent<-as.data.frame(round(prop.table(table(tourist$thotel)),4))

#合并thotel_count和thotel_percent
thotel_Freq<-cbind(thotel_count,thotel_percent$Freq)

#更改变量名
colnames(thotel_Freq)<-c('thotel','Freq','Percent')

print(thotel_Freq)
##       thotel Freq Percent
## 1 经济型酒店  163  0.4370
## 2 豪华型酒店   35  0.0938
## 3       民宿  121  0.3244
## 4 酒店式公寓   54  0.1448
#输出为csv文件
#write.csv(thotel_Freq,paste(datafolder,'thotel_Freq.csv',sep=""),fileEncoding = "GBK")

2.1a 练习6 :频数分析

##### 练习6 : 对region变量,进行频数分析。
#区域的频数表
##### 练习6答案 : 对region变量,进行频数分析。

#频数表region_count为数据框data frame
region_count<-as.data.frame(table(tourist$region))


#百分表region_percent为数据框data frame
region_percent<-as.data.frame(round(prop.table(table(tourist$region)),4))

#合并region_count和region_percent
region_Freq<-cbind(region_count,region_percent$Freq)

#更改变量名
colnames(region_Freq)<-c('区域','频数','百分比')

print(region_Freq)
##   区域 频数 百分比
## 1 华中   62 0.1662
## 2 华东   60 0.1609
## 3 华北   62 0.1662
## 4 东北   45 0.1206
## 5 西北   48 0.1287
## 6 西南   57 0.1528
## 7 华西   39 0.1046
#输出为csv文件
#write.csv(region_Freq,paste(datafolder,'region_Freq.csv',sep=""),fileEncoding = "GBK")

2.1b 连续变量的柱形图

hist(tourist$byear)

  # 带频数值标签的柱形图
h2<-hist(tourist$byear,plot=FALSE)
hist(tourist$byear,main="", xlab='出生年份')
hist(tourist$byear,main="", xlab='出生年份',labels = TRUE)

hist(tourist$byear,main="", xlab='出生年份',labels = TRUE,ylim=c(0,50))

hist(tourist$byear,main="", xlab='出生年份',labels = TRUE,ylim=c(0,50),col='lightblue')

hist(tourist$byear,main="", xlab='出生年份',labels = TRUE,ylim=c(0,50),col='cyan4')

#html color codes
#https://html-color.codes/
# r colors names
# https://r-charts.com/colors/

2.1c 练习7 age4的频数分析

#### 综合一点的练习

##### 练习7 : 
##### step1 参考1.4 根据byear,计算生成年龄变量age
##### step2 参考1.7 查看年龄变量age的分布
##### step3 参考1.5 把连续变量age,转换为年龄段变量,例如age4(20岁以下,20-40,40-60,60岁以上)
##### step4 参考2.1 对age4变量进行频数分析

练习7的答案

# 练习7 答案
age<-2023-tourist$byear
hist(age)

attach(tourist)
## The following object is masked _by_ .GlobalEnv:
## 
##     age
tourist$age4[age  < 20] <- 1
tourist$age4[age >= 20 & age < 40] <- 2
tourist$age4[age >= 40 & age < 60] <- 3
tourist$age4[age >= 60] <- 4
detach(tourist)

tourist$age4 <- factor(tourist$age4,levels = c(1,2,3,4),labels = c("20岁以下", "20-40", "40-60","60岁以上"))

t3<-table(tourist$age4)
t3
## 
## 20岁以下    20-40    40-60 60岁以上 
##       15      132      145       81
round(prop.table(t3)*100,2)
## 
## 20岁以下    20-40    40-60 60岁以上 
##     4.02    35.39    38.87    21.72

2.2 crosstable and chi-square test

列联表与卡方检验

### crosstable 
### 推荐的命名是这样的ct_thotel_gender,但是可以简化使用ct1
### ct_thotel_gender的好处是:(1)了解列联表涉及的变量;(2)方便查找替换

ct1<-table(tourist$thotel,tourist$gender)
prop.table(ct1)
##             
##                      男         女
##   经济型酒店 0.23592493 0.20107239
##   豪华型酒店 0.05898123 0.03485255
##   民宿       0.20375335 0.12064343
##   酒店式公寓 0.07506702 0.06970509
prop.table(ct1,1)
##             
##                     男        女
##   经济型酒店 0.5398773 0.4601227
##   豪华型酒店 0.6285714 0.3714286
##   民宿       0.6280992 0.3719008
##   酒店式公寓 0.5185185 0.4814815
prop.table(ct1,2)
##             
##                      男         女
##   经济型酒店 0.41121495 0.47169811
##   豪华型酒店 0.10280374 0.08176101
##   民宿       0.35514019 0.28301887
##   酒店式公寓 0.13084112 0.16352201
round(prop.table(ct1,1)*100,2)
##             
##                 男    女
##   经济型酒店 53.99 46.01
##   豪华型酒店 62.86 37.14
##   民宿       62.81 37.19
##   酒店式公寓 51.85 48.15
# summary 输出卡方检验结果chi-square test
summary(ct1)
## Number of cases in table: 373 
## Number of factors: 2 
## Test for independence of all factors:
##  Chisq = 3.33, df = 3, p-value = 0.3435

为了输出更整洁一点,可以转换为数据框data frame

#
#频数表ct1为数据框data frame
### 推荐的命名是这样的ct_thotel_gender,但是可以简化使用ct1
### ct_thotel_gender的好处是:(1)了解列联表涉及的变量;(2)方便查找替换

Freq_thotel_gender<-as.data.frame(table(tourist$thotel,tourist$gender))


#频数表ct_thotel_gender为数据框data frame
Percent_thotel_gender<-as.data.frame(round(prop.table(table(tourist$thotel,tourist$gender),1)*100,2))


#合并Freq_thotel_gender和Percent_thotel_gender
ct_thotel_gender<-cbind(Freq_thotel_gender,Percent_thotel_gender$Freq)

#更改变量名
colnames(ct_thotel_gender)<-c('住宿类型','性别','频数','百分比')

print(ct_thotel_gender)
##     住宿类型 性别 频数 百分比
## 1 经济型酒店   男   88  53.99
## 2 豪华型酒店   男   22  62.86
## 3       民宿   男   76  62.81
## 4 酒店式公寓   男   28  51.85
## 5 经济型酒店   女   75  46.01
## 6 豪华型酒店   女   13  37.14
## 7       民宿   女   45  37.19
## 8 酒店式公寓   女   26  48.15
#输出为csv文件
#write.csv(ct_thotel_gender,paste(datafolder,'tourist_hotel.csv',sep=""))

练习8 卡方检验

##### 练习8 : 利用2.2的卡方检验,检验如下研究假设
##### H1 收入income3与酒店类型thotel有关
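
一个参考思路(沿用2.2的列联表与summary卡方检验,仅作示意,结论请结合输出自行判断):

# 练习8参考思路:income3与thotel的列联表与卡方检验
ct_thotel_income3<-table(tourist$thotel,tourist$income3)
round(prop.table(ct_thotel_income3,2)*100,2)
summary(ct_thotel_income3)  #summary输出卡方检验结果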

2.3 independent T-test

t.test(tourist$sat1~tourist$gender)
## 
##  Welch Two Sample t-test
## 
## data:  tourist$sat1 by tourist$gender
## t = -3.2406, df = 351.03, p-value = 0.001307
## alternative hypothesis: true difference in means between group 男 and group 女 is not equal to 0
## 95 percent confidence interval:
##  -0.39782875 -0.09732202
## sample estimates:
## mean in group 男 mean in group 女 
##         4.242991         4.490566

更整洁的表格,需要broom包

ttest_sat1_gender<-t.test(tourist$sat1~tourist$gender)
tidy(ttest_sat1_gender)
## # A tibble: 1 × 10
##   estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high
##      <dbl>     <dbl>     <dbl>     <dbl>   <dbl>     <dbl>    <dbl>     <dbl>
## 1   -0.248      4.24      4.49     -3.24 0.00131      351.   -0.398   -0.0973
## # ℹ 2 more variables: method <chr>, alternative <chr>

把6个满意度变量与性别关系的T检验结果合并在一起输出。

# 计算6个t-test
# 第10-15列是满意度变量sat1-sat6,所以tourist[,10:15]与tourist[,c('sat1','sat2','sat3','sat4','sat5','sat6')]效果相同,直接引用变量名,程序容易理解,但是写起来麻烦

ttest_sat <- purrr::map(tourist[,10:15], ~t.test(.x ~ tourist$gender))

#整洁输出
report_ttest_sat<-data.frame(tidy(ttest_sat[[1]]))[1,]

for(i in 2:6){
  report_ttest_sat<-rbind(report_ttest_sat,data.frame(tidy(ttest_sat[[i]]))[1,])
}

# 增加变量名

for(i in 1:6){
  report_ttest_sat[i,1]<-names(tourist[9+i])
}

colnames(report_ttest_sat)<-c('变量','男','女','t','p.value','parameter','conf.low','conf.high','method','alternative')
report_ttest_sat
##   变量       男       女         t      p.value parameter   conf.low
## 1 sat1 4.242991 4.490566 -3.240644 1.306688e-03  351.0320 -0.3978288
## 2 sat2 3.654206 3.861635 -1.891915 5.930154e-02  361.3938 -0.4230422
## 3 sat3 3.500000 3.911950 -3.412197 7.175577e-04  360.5481 -0.6493703
## 4 sat4 4.028037 4.465409 -4.914159 1.340541e-06  370.9749 -0.6123837
## 5 sat5 4.046729 4.163522 -1.411346 1.590744e-01  333.6024 -0.2795764
## 6 sat6 3.822430 3.974843 -1.424272 1.552309e-01  361.2393 -0.3628557
##      conf.high                  method alternative
## 1 -0.097322017 Welch Two Sample t-test   two.sided
## 2  0.008183002 Welch Two Sample t-test   two.sided
## 3 -0.174529052 Welch Two Sample t-test   two.sided
## 4 -0.262359174 Welch Two Sample t-test   two.sided
## 5  0.045990279 Welch Two Sample t-test   two.sided
## 6  0.058029964 Welch Two Sample t-test   two.sided
# 输出分析结果
write.csv(report_ttest_sat,paste(datafolder,'report_ttest_sat.csv',sep=""),fileEncoding = "GBK")

练习9 T检验

##### 练习9 : 利用2.3的T检验,检验如下研究假设
##### H3 女士比男士(gender)对景区的总体满意度(sat)更高
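
一个参考思路(沿用2.3的t.test,仅作示意):gender因子的第1个水平是"男",t.test比较的是"男-女"的均值差;若假设女士更高,备择假设应为差值小于0,即alternative="less"。

# 练习9参考思路:总体满意度sat按性别的单侧T检验
t.test(tourist$sat~tourist$gender,alternative="less")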

2.4 ANOVA

单因素方差分析

aov_income3<-aov(sat1~income3,tourist)
summary(aov_income3)
##              Df Sum Sq Mean Sq F value Pr(>F)
## income3       2   0.72  0.3594   0.646  0.525
## Residuals   370 205.97  0.5567
tidy(aov_income3)
## # A tibble: 2 × 6
##   term         df   sumsq meansq statistic p.value
##   <chr>     <dbl>   <dbl>  <dbl>     <dbl>   <dbl>
## 1 income3       2   0.719  0.359     0.646   0.525
## 2 Residuals   370 206.     0.557    NA      NA

2.5 correlation

相关分析

#两个变量关系的散点图
plot(tourist$income,tourist$expense)

cor.test(tourist$income,tourist$expense)
## 
##  Pearson's product-moment correlation
## 
## data:  tourist$income and tourist$expense
## t = 9.5162, df = 371, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3574822 0.5210526
## sample estimates:
##       cor 
## 0.4429459
cor(tourist$income,tourist$expense,method="spearman")
## [1] 0.4027449

更加整洁的输出

c1<-cor.test(tourist$income,tourist$expense)
tidy(c1)
## # A tibble: 1 × 8
##   estimate statistic  p.value parameter conf.low conf.high method    alternative
##      <dbl>     <dbl>    <dbl>     <int>    <dbl>     <dbl> <chr>     <chr>      
## 1    0.443      9.52 2.35e-19       371    0.357     0.521 Pearson'… two.sided

相关系数矩阵

# 第5列是income,第6列是expense,第10-15列是满意度变量sat1-sat6
cor_data<-tourist[,c(5,6,10:15)]
mydata.cor = cor(cor_data)
mydata.cor
##              income      expense         sat1        sat2        sat3
## income   1.00000000  0.442945941  0.026740705 -0.01573717 -0.03478183
## expense  0.44294594  1.000000000  0.002191373 -0.02832285 -0.09644656
## sat1     0.02674071  0.002191373  1.000000000  0.19670856  0.10043646
## sat2    -0.01573717 -0.028322850  0.196708559  1.00000000 -0.04452174
## sat3    -0.03478183 -0.096446564  0.100436456 -0.04452174  1.00000000
## sat4     0.01931803 -0.088759307  0.044004353  0.06493726  0.04694267
## sat5    -0.03642428 -0.060271195 -0.039155116  0.09324654  0.09942254
## sat6    -0.04216386  0.073107112  0.054031745 -0.01636401 -0.11162753
##                sat4        sat5        sat6
## income   0.01931803 -0.03642428 -0.04216386
## expense -0.08875931 -0.06027119  0.07310711
## sat1     0.04400435 -0.03915512  0.05403174
## sat2     0.06493726  0.09324654 -0.01636401
## sat3     0.04694267  0.09942254 -0.11162753
## sat4     1.00000000 -0.09264920  0.10743228
## sat5    -0.09264920  1.00000000  0.02634869
## sat6     0.10743228  0.02634869  1.00000000

rcorr来自Hmisc包

mydata.rcorr <- rcorr(as.matrix(cor_data))
mydata.coeff <- mydata.rcorr$r
mydata.coeff 
##              income      expense         sat1        sat2        sat3
## income   1.00000000  0.442945941  0.026740705 -0.01573717 -0.03478183
## expense  0.44294594  1.000000000  0.002191373 -0.02832285 -0.09644656
## sat1     0.02674071  0.002191373  1.000000000  0.19670856  0.10043646
## sat2    -0.01573717 -0.028322850  0.196708559  1.00000000 -0.04452174
## sat3    -0.03478183 -0.096446564  0.100436456 -0.04452174  1.00000000
## sat4     0.01931803 -0.088759307  0.044004353  0.06493726  0.04694267
## sat5    -0.03642428 -0.060271195 -0.039155116  0.09324654  0.09942254
## sat6    -0.04216386  0.073107112  0.054031745 -0.01636401 -0.11162753
##                sat4        sat5        sat6
## income   0.01931803 -0.03642428 -0.04216386
## expense -0.08875931 -0.06027119  0.07310711
## sat1     0.04400435 -0.03915512  0.05403174
## sat2     0.06493726  0.09324654 -0.01636401
## sat3     0.04694267  0.09942254 -0.11162753
## sat4     1.00000000 -0.09264920  0.10743228
## sat5    -0.09264920  1.00000000  0.02634869
## sat6     0.10743228  0.02634869  1.00000000
mydata.p <- mydata.rcorr$P
mydata.p
##            income    expense         sat1         sat2       sat3       sat4
## income         NA 0.00000000 0.6066875674 0.7619402861 0.50305111 0.70998546
## expense 0.0000000         NA 0.9663548597 0.5855619683 0.06277516 0.08692367
## sat1    0.6066876 0.96635486           NA 0.0001314393 0.05260579 0.39675922
## sat2    0.7619403 0.58556197 0.0001314393           NA 0.39122559 0.21084024
## sat3    0.5030511 0.06277516 0.0526057891 0.3912255868         NA 0.36595875
## sat4    0.7099855 0.08692367 0.3967592185 0.2108402357 0.36595875         NA
## sat5    0.4830900 0.24556908 0.4508720781 0.0720556601 0.05505029 0.07390625
## sat6    0.4168218 0.15881152 0.2979756738 0.7527602339 0.03113131 0.03808825
##               sat5       sat6
## income  0.48309000 0.41682182
## expense 0.24556908 0.15881152
## sat1    0.45087208 0.29797567
## sat2    0.07205566 0.75276023
## sat3    0.05505029 0.03113131
## sat4    0.07390625 0.03808825
## sat5            NA 0.61197396
## sat6    0.61197396         NA

tidy report

#rcorr来自Hmisc包
tidy(mydata.rcorr)
## # A tibble: 28 × 5
##    column1 column2 estimate     n  p.value
##    <chr>   <chr>      <dbl> <int>    <dbl>
##  1 expense income   0.443     373 0       
##  2 sat1    income   0.0267    373 0.607   
##  3 sat1    expense  0.00219   373 0.966   
##  4 sat2    income  -0.0157    373 0.762   
##  5 sat2    expense -0.0283    373 0.586   
##  6 sat2    sat1     0.197     373 0.000131
##  7 sat3    income  -0.0348    373 0.503   
##  8 sat3    expense -0.0964    373 0.0628  
##  9 sat3    sat1     0.100     373 0.0526  
## 10 sat3    sat2    -0.0445    373 0.391   
## # ℹ 18 more rows

相关系数矩阵图

corrplot(mydata.cor)

3.ggplot2绘图

设定工作目录

datafolder<-'D:/tdata/tourist_cn_R/'

加载数据

load(paste0(datafolder,'tourist_cn.rda'))

3.1 qplot快捷做图

qplot(tourist$income,tourist$expense)
## Warning: `qplot()` was deprecated in ggplot2 3.4.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

3.2 标准的ggplot2散点图

3.2a 散点图的基本框架geom_point

ggplot(tourist,aes(x=income,y=expense))+
  geom_point(size=1)

#size是散点图中点的大小

3.2b 散点图的分组着色

基于gender给数据点着色

ggplot(tourist,aes(x=income,y=expense))+
  geom_point(size=2,aes(colour=gender))

基于type3给数据点着色

ggplot(tourist,aes(x=income,y=expense))+
  geom_point(size=2,aes(colour=type3))

3.2c 散点图的分面(facet)

使用type3进行分面

ggplot(tourist,aes(x=income,y=expense))+
  geom_point(size=2)+
  facet_grid(type3 ~.)

使用gender进行分面

ggplot(tourist,aes(x=income,y=expense))+
  geom_point(size=2,aes(colour=type3))+
  facet_grid(gender ~.)

使用type3、type2进行分面
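
先单独画出这个分面图(与3.2d中保存的图相同):

ggplot(tourist,aes(x=income,y=expense))+
  geom_point(size=2,aes(colour=type3))+
  facet_grid(type3 ~type2)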

3.2d 保存输出图片

#png()打开png图形设备,用于将图形保存为图片文件
png(paste0(datafolder,'scatexpenser_income_expense.png'))
scatexpenser_income_expense<-ggplot(tourist,aes(x=income,y=expense))+
  geom_point(size=2,aes(colour=type3))+
  facet_grid(type3 ~type2)


scatexpenser_income_expense

#print()把图形输出到当前图形设备,配合dev.off()关闭设备、完成保存
print(scatexpenser_income_expense)
dev.off()
## png 
##   2

3.3 柱形图(histogram)

3.3a 柱形图的基本框架(geom_histogram)

ggplot(tourist,aes(x=sat2,fill=gender))+
  geom_histogram(binwidth=1,colour="white")

3.3b 使用type3、type2进行分面

ggplot(tourist,aes(x=sat2,fill=type3))+
  geom_histogram(binwidth=1,colour="white")+
  facet_grid(type2 ~type3)

4. map地图

#设定工作文件夹, 根据自己电脑的情况进行修改
datafolder<-'D:/tdata/tourist_cn_R/'

load是用于载入保存了的rda格式数据

load(paste0(datafolder,'tourist_cn.rda'))

leaflet地图

# 中心位置,缩放等级
center_lon = median(tourist$longitude,na.rm = TRUE)
center_lat = median(tourist$latitude,na.rm = TRUE)

leaflet(tourist) %>% addProviderTiles("Esri.NatGeoWorldMap") %>%
  addCircles(lng = ~longitude, lat = ~latitude,radius = ~sqrt(income/10000000))  %>%
  setView(lng=center_lon, lat=center_lat,zoom = 10)
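
下面是一个加弹出标注的示意变体(仅作演示:popup是addCircles的参数,点击圆点时显示该受访者的满意度sat1):

#示意:给圆点加popup标注
leaflet(tourist) %>% addProviderTiles("Esri.NatGeoWorldMap") %>%
  addCircles(lng = ~longitude, lat = ~latitude,radius = ~sqrt(income/10000000),
             popup = ~paste("满意度sat1:",sat1))  %>%
  setView(lng=center_lon, lat=center_lat,zoom = 10)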

5 IPA图

IPA(importance-performance analysis,重要性-表现分析)方法相对简单,但在管理、营销领域应用广泛。

Mejia, C., Bąk, M., Zientara, P., & Orlowski, M. (2022). Importance-performance analysis of socially sustainable practices in U.S. restaurants: A consumer perspective in the quasi-post-pandemic context. International Journal of Hospitality Management, 103, 103209. https://doi.org/10.1016/j.ijhm.2022.103209

#设定工作文件夹, 根据自己电脑的情况进行修改
datafolder<-'D:/tdata/tourist_cn_R/'

load是用于载入保存了的rda格式数据

load(paste0(datafolder,'tourist_cn.rda'))

5.1 data wrangling

review_label<-as.data.frame(c('评论数量',   '评论发表日期',   '评论与酒店性能关联',    '正向评价', '发布者资信度'))
colnames(review_label)<-c("label")
tourist1<-tourist[,c("ri1","ri2","ri3","ri4","ri5","rp1","rp2","rp3","rp4","rp5")]
hdata<-colMeans(tourist1)


im<-tourist[,c("ri1","ri2","ri3","ri4","ri5")]
pe<-tourist[,c("rp1","rp2","rp3","rp4","rp5")]


imdata<-as.data.frame(colMeans(im))
colnames(imdata)<-c("importance")

pedata<-as.data.frame(colMeans(pe))
colnames(pedata)<-c("performance")
ipa<-cbind(imdata,pedata,review_label)

5.2 计算x轴、y轴的取值范围

print("------------importance----------")
## [1] "------------importance----------"
print(paste("min :",round(min(ipa[1]),2)))
## [1] "min : 4.35"
print(paste("max :",round(max(ipa[1]),2)))
## [1] "max : 4.73"
print("------------performance----------")
## [1] "------------performance----------"
print(paste("min :",round(min(ipa[2]),2)))
## [1] "min : 4.32"
print(paste("max :",round(max(ipa[2]),2)))
## [1] "max : 4.72"
print(colMeans(ipa[,1:2]))
##  importance performance 
##    4.554960    4.515818

5.3 IPA图

# 基本图形架构
# xlim是x轴的区间——参考了5.2的计算结果
# ylim是y轴的区间——参考了5.2的计算结果
# labs是坐标轴的标注
# geom_vline是增加一条垂直的线,取值xintercept,颜色color,线型linetype
# geom_hline是增加一条水平的线,取值yintercept,颜色color,线型linetype
# geom_abline是增加一条斜线:y=ax+b,截距intercept,斜率slope,颜色color,粗细size
# geom_text_repel,数据点的标注问题,不重叠

ipa_chart<-ggplot(ipa,aes(x=performance,y=importance))+
  geom_point(size=2)+ 
  xlim(4.2, 4.8)+
  ylim(4.2, 4.8)+
  labs(x = "表现",y="重要性")+
  geom_vline(xintercept=4.516,color = "red",linetype="dashed")+
  geom_hline(yintercept=4.555,color = "red",linetype="dashed")+
  geom_abline(intercept = 0, slope = 1, color="blue",size=0.5)+
  geom_text_repel(aes(label=label),size=4,hjust=0.3,vjust=0.3)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
ipa_chart

#图片输出
#ggsave(paste(datafolder,'ipa_chart.png',sep=""),ipa_chart, width = 10)

6.EFA探索性因子分析

#设定工作文件夹, 根据自己电脑的情况进行修改
datafolder<-'D:/tdata/tourist_cn_R/'

load是用于载入保存了的rda格式数据

load(paste0(datafolder,'tourist_cn.rda'))

6.1 datasets数据集整理

选取变量的2种方法: (1)用列号 zhdata<-tourist[,c(34:40)]

(2)用变量名称 zhdata<-tourist[,c('zh1','zh2','zh3','zh4','zh5','zh6','zh7')]

#satdata<-tourist[,c(10:15)]
#tedata<-tourist[,c(26:33)]
zhdata<-tourist[,c(34:40)]
#zhdata<-tourist[,c('zh1','zh2','zh3','zh4','zh5','zh6','zh7')]


#head(satdata)
#head(tedata)
head(zhdata)
##   zh1 zh2 zh3 zh4 zh5 zh6 zh7
## 1   3   1   1   2   3   2   2
## 2   4   4   5   3   5   5   5
## 3   1   1   2   2   1   3   1
## 4   3   3   3   3   2   2   2
## 5   1   1   1   4   2   1   1
## 6   4   3   3   4   2   4   4

6.2 kmo test

KMO(Kaiser-Meyer-Olkin)检验值在0-1之间,值越大越适合探索性因子分析EFA,通常要求至少0.7以上。

kmo_test<-KMO(zhdata)
print(kmo_test, stats = c("both", "MSA", "KMO"), vars = "all", sort = FALSE, show = "all", digits = 3)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = zhdata)
## Overall MSA =  0.872
## MSA for each item = 
##   zh1   zh2   zh3   zh4   zh5   zh6   zh7 
## 0.894 0.881 0.887 0.897 0.907 0.821 0.823

6.3 Bartlett’s test

Bartlett球形检验(Bartlett's test of sphericity):p<0.05时统计上显著,适合做探索性因子分析EFA。

cortest.bartlett(zhdata)
## R was not square, finding R from data
## $chisq
## [1] 1820.569
## 
## $p.value
## [1] 0
## 
## $df
## [1] 21

6.4判断因子数量

parallel <- fa.parallel(zhdata, fm = 'minres', fa = 'both',n.iter=100)

## Parallel analysis suggests that the number of factors =  2  and the number of components =  1

推荐提取2个因子

6.5 提取2个因子

efa2 <- fa(zhdata,nfactors = 2,rotate = "oblimin",fm="minres")
## Loading required namespace: GPArotation
print(efa2)
## Factor Analysis using method =  minres
## Call: fa(r = zhdata, nfactors = 2, rotate = "oblimin", fm = "minres")
## Standardized loadings (pattern matrix) based upon correlation matrix
##       MR1   MR2   h2   u2 com
## zh1  0.82  0.02 0.69 0.31   1
## zh2  0.91 -0.06 0.76 0.24   1
## zh3  0.88  0.02 0.78 0.22   1
## zh4  0.73  0.09 0.64 0.36   1
## zh5  0.09  0.74 0.64 0.36   1
## zh6 -0.03  0.87 0.72 0.28   1
## zh7 -0.01  0.91 0.81 0.19   1
## 
##                        MR1  MR2
## SS loadings           2.86 2.18
## Proportion Var        0.41 0.31
## Cumulative Var        0.41 0.72
## Proportion Explained  0.57 0.43
## Cumulative Proportion 0.57 1.00
## 
##  With factor correlations of 
##      MR1  MR2
## MR1 1.00 0.66
## MR2 0.66 1.00
## 
## Mean item complexity =  1
## Test of the hypothesis that 2 factors are sufficient.
## 
## df null model =  21  with the objective function =  4.94 with Chi Square =  1820.57
## df of  the model are 8  and the objective function was  0.11 
## 
## The root mean square of the residuals (RMSR) is  0.02 
## The df corrected root mean square of the residuals is  0.03 
## 
## The harmonic n.obs is  373 with the empirical chi square  6.95  with prob <  0.54 
## The total n.obs was  373  with Likelihood Chi Square =  39.38  with prob <  4.2e-06 
## 
## Tucker Lewis Index of factoring reliability =  0.954
## RMSEA index =  0.103  and the 90 % confidence intervals are  0.072 0.136
## BIC =  -7.99
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy             
##                                                    MR1  MR2
## Correlation of (regression) scores with factors   0.96 0.95
## Multiple R square of scores with factors          0.92 0.90
## Minimum correlation of possible factor scores     0.84 0.81
print(efa2$loadings,cutoff = 0.5)
## 
## Loadings:
##     MR1    MR2   
## zh1  0.821       
## zh2  0.911       
## zh3  0.875       
## zh4  0.733       
## zh5         0.737
## zh6         0.868
## zh7         0.910
## 
##                  MR1   MR2
## SS loadings    2.816 2.136
## Proportion Var 0.402 0.305
## Cumulative Var 0.402 0.707

6.6图示

EFA的图示1

factor.plot(efa2,labels=rownames(efa2$loadings))

EFA的图示2

fa.diagram(efa2)

7.CFA验证性因子分析Confirmatory Factor Analysis

#设定工作文件夹, 根据自己电脑的情况进行修改
datafolder<-'D:/tdata/tourist_cn_R/'

load是用于载入保存了的rda格式数据

load(paste0(datafolder,'tourist_cn.rda'))
EFA因子载荷(参考6.5的结果):MR1——zh1 0.821、zh2 0.911、zh3 0.875、zh4 0.733;MR2——zh5 0.737、zh6 0.868、zh7 0.910。CFA据此把zh1-zh4归入维度D1,zh5-zh7归入维度D2。

7.1 CFA模型设定

cfa_model <- " D1 =~ zh1 + zh2 + zh3 + zh4 
 D2 =~ zh5 + zh6 + zh7 "

7.2 CFA模型估计

#若第6部分未运行,需先从tourist中选出zh1-zh7
zhdata<-tourist[,c('zh1','zh2','zh3','zh4','zh5','zh6','zh7')]

cfa.fit <- cfa(cfa_model, data=zhdata)
summary(cfa.fit, fit.measures=TRUE)
## lavaan 0.6.15 ended normally after 25 iterations
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                        15
## 
##   Number of observations                           373
## 
## Model Test User Model:
##                                                       
##   Test statistic                                48.083
##   Degrees of freedom                                13
##   P-value (Chi-square)                           0.000
## 
## Model Test Baseline Model:
## 
##   Test statistic                              1841.136
##   Degrees of freedom                                21
##   P-value                                        0.000
## 
## User Model versus Baseline Model:
## 
##   Comparative Fit Index (CFI)                    0.981
##   Tucker-Lewis Index (TLI)                       0.969
## 
## Loglikelihood and Information Criteria:
## 
##   Loglikelihood user model (H0)              -3488.689
##   Loglikelihood unrestricted model (H1)      -3464.647
##                                                       
##   Akaike (AIC)                                7007.377
##   Bayesian (BIC)                              7066.201
##   Sample-size adjusted Bayesian (SABIC)       7018.610
## 
## Root Mean Square Error of Approximation:
## 
##   RMSEA                                          0.085
##   90 Percent confidence interval - lower         0.060
##   90 Percent confidence interval - upper         0.111
##   P-value H_0: RMSEA <= 0.050                    0.012
##   P-value H_0: RMSEA >= 0.080                    0.655
## 
## Standardized Root Mean Square Residual:
## 
##   SRMR                                           0.027
## 
## Parameter Estimates:
## 
##   Standard errors                             Standard
##   Information                                 Expected
##   Information saturated (h1) model          Structured
## 
## Latent Variables:
##                    Estimate  Std.Err  z-value  P(>|z|)
##   D1 =~                                               
##     zh1               1.000                           
##     zh2               1.058    0.053   19.985    0.000
##     zh3               1.097    0.053   20.885    0.000
##     zh4               0.999    0.055   18.218    0.000
##   D2 =~                                               
##     zh5               1.000                           
##     zh6               1.108    0.061   18.060    0.000
##     zh7               1.183    0.061   19.257    0.000
## 
## Covariances:
##                    Estimate  Std.Err  z-value  P(>|z|)
##   D1 ~~                                               
##     D2                0.714    0.080    8.978    0.000
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)
##    .zh1               0.512    0.047   10.851    0.000
##    .zh2               0.452    0.045   10.068    0.000
##    .zh3               0.374    0.042    9.008    0.000
##    .zh4               0.609    0.054   11.319    0.000
##    .zh5               0.534    0.050   10.684    0.000
##    .zh6               0.505    0.052    9.656    0.000
##    .zh7               0.316    0.047    6.751    0.000
##     D1                1.137    0.118    9.613    0.000
##     D2                0.988    0.109    9.074    0.000

7.3 CFA模型图示

semPaths(cfa.fit, what="est", fade=FALSE, residuals=FALSE,edge.label.cex=0.75)

8.信度分析reliability analysis

信度分析参考资料

https://lhbikos.github.io/ReC_Psychometrics/rxx.html

#设定工作文件夹, 根据自己电脑的情况进行修改
datafolder<-'D:/tdata/tourist_cn_R/'

load是用于载入保存了的rda格式数据

load(paste0(datafolder,'tourist_cn.rda'))
if(!require(psych)){install.packages("psych")}

维度1的信度分析

zh_d1<-tourist[,c('zh1','zh2','zh3','zh4')]
#alpha(zh_d1)
splitHalf(zh_d1, raw = TRUE, brute = TRUE)
## Split half reliabilities  
## Call: splitHalf(r = zh_d1, raw = TRUE, brute = TRUE)
## 
## Maximum split half reliability (lambda 4) =  0.92
## Guttman lambda 6                          =  0.89
## Average split half reliability            =  0.91
## Guttman lambda 3 (alpha)                  =  0.91
## Guttman lambda 2                          =  0.91
## Minimum split half reliability  (beta)    =  0.89
## Average interitem r =  0.71  with median =  0.73
##                                              2.5% 50% 97.5%
##  Quantiles of split half reliability      =  0.89 0.91 0.92

维度2的信度分析

zh_d2<-tourist[,c('zh5','zh6','zh7')]
splitHalf(zh_d2, raw = TRUE, brute = TRUE)
## Split half reliabilities  
## Call: splitHalf(r = zh_d2, raw = TRUE, brute = TRUE)
## 
## Maximum split half reliability (lambda 4) =  0.78
## Guttman lambda 6                          =  0.84
## Average split half reliability            =  1.03
## Guttman lambda 3 (alpha)                  =  0.88
## Guttman lambda 2                          =  0.89
## Minimum split half reliability  (beta)    =  0.76
## Average interitem r =  0.72  with median =  0.72
##                                              2.5% 50% 97.5%
##  Quantiles of split half reliability      =  0.76 0.77 0.78