MDS简介
MDS(多维尺度分析,Multidimensional Scaling)是一组统计技术,用于把一组距离所体现的相似性和差异性可视化。经典MDS的处理过程是:输入一个包含数据集中任意两个数据点之间距离的距离矩阵,返回一个坐标集合,这组坐标可以近似反映每对数据点之间的距离。
之所以说是近似反映,是因为在二维空间中可能不存在恰好满足这组距离的点集。例如:3个彼此之间距离都是1的点构成一个等边三角形的顶点;在二维平面上,不可能再找到第4个点,使它到这三个顶点的距离都是1。
MDS简单应用
构建距离矩阵
library('foreign')
library('ggplot2')
# Build a toy ratings matrix: four raters (rows A-D) each give six items
# (columns P1-P6) a rating encoded as 1, 0 or -1.
set.seed(851982) # To make sure results are consistent
# 24 draws = 4 rows x 6 columns, filled column by column.
ex.matrix <- matrix(sample(c(-1, 0, 1), 24, replace = TRUE),
                    nrow = 4,
                    ncol = 6)
row.names(ex.matrix) <- c('A', 'B', 'C', 'D')
colnames(ex.matrix) <- c('P1', 'P2', 'P3', 'P4', 'P5', 'P6')
数据如下
   P1 P2 P3 P4 P5 P6
A   0 -1  0 -1  0  0
B  -1  0  1  1  1  0
C   0  0  0  1 -1  1
D   1  0  1 -1  0  0
构建相似性矩阵
这里用A*t(A)表示不同样本间的相似性
# Similarity between the rows of the ratings matrix: the matrix multiplied
# by its own transpose, computed here with tcrossprod().
ex.mult <- tcrossprod(ex.matrix)
ex.mult
数据如下
   A  B  C  D
A  2 -1 -1  1
B -1  4  0 -1
C -1  0  3 -1
D  1 -1 -1  3
计算欧氏距离
# Pairwise Euclidean distance between the rows of the similarity matrix
# ("euclidean" is dist()'s default method, spelled out here for clarity).
ex.dist <- dist(ex.mult, method = "euclidean")
ex.dist
数据如下
         A        B        C
B 6.244998
C 5.477226 5.000000
D 2.236068 6.782330 6.082763
MDS进行可视化
# Visualize clusters: classical MDS turns the distances into coordinates,
# which are then drawn as text labels on an otherwise empty plot.
ex.mds <- cmdscale(ex.dist)
plot(ex.mds, type = "n") # type "n" sets up the axes without drawing points
text(ex.mds, labels = c("A", "B", "C", "D"))
结果:
A B C
B
C
D
书中投票分类例子
dataclean
收集数据文件名
library('foreign')
library('ggplot2')
# Locate the roll-call data directory and list the files it contains.
data.dir <- file.path("data", "roll_call")
data.files <- list.files(path = data.dir)
data.files
#[1] "sen101kh.dta" "sen102kh.dta"
#[3] "sen103kh.dta" "sen104kh.dta"
#[5] "sen105kh.dta" "sen106kh.dta"
#[7] "sen107kh.dta" "sen108kh_7.dta"
#[9] "sen109kh.dta" "sen110kh_2008.dta"
#[11] "sen111kh.dta"
foreign包读取dta数据
# Read every listed file with foreign::read.dta into a list of data frames,
# one per file. convert.factors = FALSE keeps the coded values as stored
# instead of turning them into factors.
rollcall.data <- lapply(data.files, function(f) {
  read.dta(file.path(data.dir, f), convert.factors = FALSE)
})
# Ninth code snippet
# Inspect the first data set, i.e. the first file listed (sen101kh.dta).
dim(rollcall.data[[1]])
# [1] 103 647  -- reconstructed: the printed values were lost in this listing.
#   647 = 9 descriptive columns + V1..V638; 103 rows is the figure reported
#   for this file in "Machine Learning for Hackers" -- TODO confirm locally.
head(rollcall.data[[1]])
# NOTE(review): the numeric fields of the rows below were lost in this listing;
# only the column header and the text fields (lstate, name) survive.
#cong id state dist lstate party eh1 eh2 name V1 V2 V3 ... V638
# USA BUSH ...
# ALABAMA SHELBY, RIC ...
# ALABAMA HEFLIN, HOW ...
# ALASKA STEVENS, TH ...
# ALASKA MURKOWSKI, ...
# ARIZONA DECONCINI, ...
按照document清洗数据
# Recode one Congress's roll-call data frame into a numeric vote matrix.
#
# df: data frame as read from a roll-call .dta file; the first nine columns
#     are descriptive (cong, id, state, dist, lstate, party, eh1, eh2, name)
#     and the vote columns start at column 10.
#
# Returns a matrix with one row per legislator and one column per vote, where
# 1 = yea, -1 = nay and 0 = did not vote.
rollcall.simplified <- function(df)
{
  # State code 99 marks the President's row; keep legislators only.
  no.pres <- subset(df, state < 99)
  vote.cols <- 10:ncol(no.pres)

  # The order of the three steps matters: each later step relies on the codes
  # already collapsed by the earlier ones.
  for(i in vote.cols)
  {
    # Codes 7-9 (forms of non-voting) collapse to 0, joining the existing 0.
    no.pres[,i] <- ifelse(no.pres[,i] > 6, 0, no.pres[,i])
    # Codes 1-3 (forms of yea) collapse to 1.
    no.pres[,i] <- ifelse(no.pres[,i] > 0 & no.pres[,i] < 4, 1, no.pres[,i])
    # Whatever is still above 1 is a form of nay (codes 4-6).
    no.pres[,i] <- ifelse(no.pres[,i] > 1, -1, no.pres[,i])
  }

  # drop = FALSE keeps the column name even when there is a single vote column.
  return(as.matrix(no.pres[, vote.cols, drop = FALSE]))
}
# Recode every Congress's data frame into its numeric vote matrix.
rollcall.simple <- lapply(X = rollcall.data, FUN = rollcall.simplified)
计算MDS(重点部分)
# Multiply each vote matrix by its transpose to get a legislator-by-legislator
# similarity matrix, and calculate the Euclidean distance between each Senator.
rollcall.dist <- lapply(rollcall.simple, function(m) dist(m %*% t(m)))
# Do the multidimensional scaling: classical MDS into k = 2 dimensions.
# Multiplying by -1 flips the sign of the coordinates (described in the
# original note as a left-right flip); pairwise distances are unaffected.
rollcall.mds <- lapply(rollcall.dist,
                       function(d) as.data.frame(cmdscale(d, k = 2) * -1))
构建MDS数据矩阵
# Congress numbers matching the eleven data files (sen101 ... sen111).
congresses <- 101:111

# Attach name, party and Congress number to every set of MDS coordinates.
for(i in seq_along(rollcall.mds))
{
  names(rollcall.mds[[i]]) <- c("x", "y")

  # Apply the same filter used when building the vote matrices (state < 99)
  # so these rows line up with the MDS coordinates.
  congress <- subset(rollcall.data[[i]], state < 99)

  # Keep only the text before the first comma or space of each name.
  congress.names <- vapply(as.character(congress$name),
                           function(n) strsplit(n, "[, ]")[[1]][1],
                           character(1))

  rollcall.mds[[i]] <- transform(rollcall.mds[[i]],
                                 name = congress.names,
                                 party = as.factor(congress$party),
                                 congress = congresses[i])
}

head(rollcall.mds[[1]])
mds图形化
# Plot a single Congress, the 110th. In the file listing sen110kh_2008.dta
# is the 10th file, so its coordinates are the 10th element of rollcall.mds.
cong.110 <- rollcall.mds[[10]]

# Shared base layer: blank axes, party-coded shapes and grey-scale colours.
base.110 <- ggplot(cong.110, aes(x = x, y = y)) +
  # A degenerate size range pins every mark to one size; legends are hidden.
  scale_size(range = c(2, 2), guide = 'none') +
  scale_alpha(guide = 'none') +
  theme_bw() +
  theme(axis.ticks = element_blank(),
        axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        panel.grid.major = element_blank()) +
  ggtitle("Roll Call Vote MDS Clustering for 110th U.S. Senate") +
  xlab("") + # no axis titles
  ylab("") +
  # Party codes as they appear in the data, relabelled for the legend.
  scale_shape(name = "Party", breaks = c("100", "200", "328"),
              labels = c("Dem.", "Rep.", "Ind."), solid = FALSE) +
  scale_color_manual(name = "Party", values = c("100" = "black",
                                                "200" = "dimgray",
                                                "328" = "grey"),
                     breaks = c("100", "200", "328"),
                     labels = c("Dem.", "Rep.", "Ind."))

# Draw each senator's name at their MDS position, coloured by party.
print(base.110 + geom_text(aes(color = party,
                               alpha = 0.75,
                               label = name,
                               size = 2)))
按不同届的国会记录画多图
# Fourteenth code snippet
# Create a single visualization of MDS for all Congresses on a grid
# Stack the per-Congress coordinate frames; the congress column added
# earlier identifies which panel each row belongs to.
all.mds <- do.call(rbind, rollcall.mds)
all.plot <- ggplot(all.mds, aes(x = x, y = y)) +
  geom_point(aes(shape = party, alpha = 0.75, size = 2)) +
  # A degenerate size range pins every point to one size; legends are hidden.
  scale_size(range = c(2, 2), guide = 'none') +
  scale_alpha(guide = 'none') +
  theme_bw() +
  theme(axis.ticks = element_blank(),
        axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        panel.grid.major = element_blank()) +
  ggtitle("Roll Call Vote MDS Clustering for U.S. Senate (101st - 111th Congress)") +
  xlab("") +
  ylab("") +
  scale_shape(name = "Party",
              breaks = c("100", "200", "328"),
              labels = c("Dem.", "Rep.", "Ind."),
              solid = FALSE) +
  # One panel per Congress.
  facet_wrap(~ congress)
print(all.plot)