淘先锋技术网

首页 1 2 3 4 5 6 7

MDS简介

MDS(Multidimensional Scaling,多维标度)是一组统计技术的集合,用于可视化地描述一组距离所体现的相似性和差异性. 经典MDS的处理过程是:输入一个包含数据集中任意两个数据点之间距离的距离矩阵,返回一个坐标集合,这个坐标集合可以近似反映每对数据点之间的距离.

之所以说是近似反映,是因为在二维空间中很可能不存在恰好满足一组给定距离的点集. 例如: 3个彼此之间距离都是1的点构成一个等边三角形的三个顶点,因此在二维平面上不可能再找到第4个点,使它到这个三角形三个顶点的距离都是1.

MDS简单应用

构建距离矩阵

library('foreign')
library('ggplot2')
# Build a 4 x 6 rating matrix: samples A-D (rows) each rate items P1-P6
# (columns) with one of the values -1, 0 or 1.
set.seed(851982) # To make sure results are consistent
# NOTE: R >= 3.6.0 changed the default algorithm behind sample(), so the
# values drawn here can differ from output produced by older R versions
# even with the same seed.
ex.matrix <- matrix(sample(c(-1, 0, 1), 24, replace = TRUE),
                    nrow = 4,
                    ncol = 6)
row.names(ex.matrix) <- c('A', 'B', 'C', 'D')
colnames(ex.matrix) <- c('P1', 'P2', 'P3', 'P4', 'P5', 'P6')

数据如下

  P1 P2 P3 P4 P5 P6
A  0 -1  0 -1  0  0
B -1  0  1  1  1  0
C  0  0  0  1 -1  1
D  1  0  1 -1  0  0

构建相似性矩阵

这里用A*t(A)表示不同样本间的相似性

# Similarity between samples: entry (i, j) is the inner product of the
# rating rows of sample i and sample j. tcrossprod(x) computes x %*% t(x).
ex.mult <- tcrossprod(ex.matrix)
ex.mult

数据如下

   A  B  C  D
A  2 -1 -1  1
B -1  4  0 -1
C -1  0  3 -1
D  1 -1 -1  3

计算欧氏距离

# Pairwise Euclidean distances between the rows of the similarity matrix
# ("euclidean" is the default method of dist(), spelled out for clarity).
ex.dist <- dist(ex.mult, method = "euclidean")
ex.dist

数据如下

         A        B        C
B 6.244998
C 5.477226 5.000000
D 2.236068 6.782330 6.082763

MDS进行可视化

# Visualize clusters: classical MDS places the four samples in two
# dimensions (k = 2 is cmdscale's default), then each sample is drawn as
# its letter at the computed coordinates.
ex.mds <- cmdscale(ex.dist, k = 2)
plot(ex.mds, type = "n") # empty canvas: sets up the axes, draws no points
text(ex.mds, labels = c("A", "B", "C", "D"))

结果:

         A        B        C
B 6.244998
C 5.477226 5.000000
D 2.236068 6.782330 6.082763

这里写图片描述

书中投票分类例子

dataclean

收集数据文件名

library('foreign')
library('ggplot2')

# Directory holding one Stata (.dta) roll-call file per Congress.
data.dir <- file.path("data", "roll_call")

# Only pick up .dta files so that any other file placed in the directory
# (notes, archives, ...) is not handed to read.dta() later on.
# list.files() returns the names sorted alphabetically.
data.files <- list.files(data.dir, pattern = "\\.dta$")

data.files

#[1] "sen101kh.dta" "sen102kh.dta"
#[3] "sen103kh.dta" "sen104kh.dta"
#[5] "sen105kh.dta" "sen106kh.dta"
#[7] "sen107kh.dta" "sen108kh_7.dta"
#[9] "sen109kh.dta" "sen110kh_2008.dta"
#[11] "sen111kh.dta"

foreign包读取dta数据

# Read every roll-call file into a data frame (one list element per file).
# file.path() is vectorised, so the full paths are built in one call and
# read.dta() is applied to each; convert.factors = FALSE is forwarded to
# read.dta() unchanged.
rollcall.data <- lapply(file.path(data.dir, data.files),
                        read.dta,
                        convert.factors = FALSE)

# Dimensions (rows x columns) of the first data frame.
dim(rollcall.data[[1]])
# e.g. [1] 103 647  (digits restored from the original listing - verify;
#                    647 = 9 descriptive columns + votes V1..V638)

head(rollcall.data[[1]])
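#  cong    id state dist  lstate party eh1 eh2        name V1 V2 V3 ... V638
#1  101 99908    99    0     USA   200   0   0        BUSH  1  1  1 ...    1
#2  101 14659    41    0 ALABAMA   100   0   1 SHELBY, RIC  1  1  1 ...    6
#3  101 14705    41    0 ALABAMA   100   0   1 HEFLIN, HOW  1  1  1 ...    6
#4  101 12109    81    0  ALASKA   200   0   1 STEVENS, TH  1  1  1 ...    1
#5  101 14907    81    0  ALASKA   200   0   1  MURKOWSKI,  1  1  1 ...    6
#6  101 14502    61    0 ARIZONA   100   0   1  DECONCINI,  1  1  1 ...    6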

按照document清洗数据

# Reduce one Congress's roll-call data frame to a numeric vote matrix.
#
# Args:
#   df: data frame with a numeric `state` column and the vote columns
#       starting at column 10 (columns 1-9 are descriptive: cong, id,
#       state, dist, lstate, party, eh1, eh2, name).
#
# Returns:
#   A matrix with one row per remaining legislator and one column per vote,
#   recoded as:
#     codes 1-3        ->  1
#     codes 4-6        -> -1
#     code 0, codes 7+ ->  0
#   (In the roll-call codebook 1-3 are the Yea variants, 4-6 the Nay
#   variants, and the rest mean present / not voting / not a member.)
#
# Rows with state >= 99 are dropped; in this data set that is the "USA"
# row, i.e. the President rather than a senator.
rollcall.simplified <- function(df)
{
  first.vote.col <- 10
  stopifnot(ncol(df) >= first.vote.col)

  no.pres <- subset(df, state < 99)

  # drop = FALSE keeps a (named) one-column matrix when there is one vote.
  votes <- as.matrix(no.pres[, first.vote.col:ncol(no.pres), drop = FALSE])

  # The order matters: after the first step only 0-6 remain, after the
  # second every code still above 1 is one of 4-6.
  votes[votes > 6] <- 0
  votes[votes > 0 & votes < 4] <- 1
  votes[votes > 1] <- -1

  votes
}

# Apply rollcall.simplified() to the data frame of every Congress.
rollcall.simple <- lapply(X = rollcall.data, FUN = rollcall.simplified)

计算MDS(important part)

# Multiply each vote matrix by its transpose to get a senator-by-senator
# similarity matrix, and calculate the Euclidean distance between each
# Senator's row of that matrix.
rollcall.dist <- lapply(rollcall.simple, function(m) dist(m %*% t(m)))

# Do the multidimensional scaling: one set of 2-D coordinates per Congress.
# Multiplying by -1 negates the coordinates (the original note describes
# this as a left-right flip of the picture); distances between points are
# not affected.
rollcall.mds <- lapply(rollcall.dist,
                       function(d) as.data.frame(cmdscale(d, k = 2) * -1))

构建MDS数据矩阵

# Congress numbers matching the eleven data files (sen101 ... sen111).
congresses <- 101:111

# Attach senator name, party and Congress number to each set of MDS
# coordinates.
for(i in seq_along(rollcall.mds))
{
  names(rollcall.mds[[i]]) <- c("x", "y")

  # Same filter as in the cleaning step (state < 99), so the rows line up
  # with the rows of the MDS coordinates.
  congress <- subset(rollcall.data[[i]], state < 99)

  # "[, ]" splits on a comma or a space; keeping the first piece leaves the
  # part of the name before the first comma/space.
  # vapply() guarantees a character vector even for an empty input.
  congress.names <- vapply(as.character(congress$name),
                           function(n) strsplit(n, "[, ]")[[1]][1],
                           character(1))

  rollcall.mds[[i]] <- transform(rollcall.mds[[i]],
                                 name = congress.names,
                                 party = as.factor(congress$party),
                                 congress = congresses[i])
}

head(rollcall.mds[[1]])

mds图形化

# MDS coordinates of the 110th Senate: the 10th data file is
# "sen110kh_2008.dta", so it is the 10th element of rollcall.mds.
cong.110 <- rollcall.mds[[10]]

# Base plot shared by the layers below: no axis ticks, labels or major
# grid lines, and party legends for both the shape and the colour scale.
base.110 <- ggplot(cong.110, aes(x = x, y = y)) +
  scale_size(range = c(2, 2), guide = 'none') + # fixed size, no legend
  scale_alpha(guide = 'none') +
  theme_bw() + # black-and-white theme
  theme(axis.ticks = element_blank(),
        axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        panel.grid.major = element_blank()) +
  ggtitle("Roll Call Vote MDS Clustering for 110th U.S. Senate") +
  xlab("") + # no axis titles
  ylab("") +
  # One point shape per party code.
  scale_shape(name = "Party", breaks = c("100", "200", "328"),
              labels = c("Dem.", "Rep.", "Ind."), solid = FALSE) +
  scale_color_manual(name = "Party", values = c("100" = "black",
                                                "200" = "dimgray",
                                                "328" = "grey"),
                     breaks = c("100", "200", "328"),
                     labels = c("Dem.", "Rep.", "Ind."))


# Draw each senator's name at its (x, y) position, coloured by party.
# `label = name` refers to the column of the plot data; using `$` inside
# aes() is discouraged by ggplot2.
print(base.110 + geom_text(aes(color = party,
                               alpha = 0.75,
                               label = name,
                               size = 2)))

这里写图片描述

按不同届的国会记录画多图

# Fourteenth code snippet
# Create a single visualization of MDS for all Congresses on a grid:
# stack the per-Congress data frames and facet on the `congress` column.
all.mds <- do.call(rbind, rollcall.mds)
all.plot <- ggplot(all.mds, aes(x = x, y = y)) +
  geom_point(aes(shape = party, alpha = 0.75, size = 2)) +
  scale_size(range = c(2, 2), guide = 'none') + # fixed size, no legend
  scale_alpha(guide = 'none') +
  theme_bw() +
  theme(axis.ticks = element_blank(),
        axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        panel.grid.major = element_blank()) +
  ggtitle("Roll Call Vote MDS Clustering for U.S. Senate (101st - 111th Congress)") +
  xlab("") +
  ylab("") +
  scale_shape(name = "Party",
              breaks = c("100", "200", "328"),
              labels = c("Dem.", "Rep.", "Ind."),
              solid = FALSE) +
  facet_wrap(~ congress) # one panel per Congress

print(all.plot)

这里写图片描述