[R爬虫]看看数据分析师的薪资

2017-11-01 20:09:48

#xpath+正则表达式提取静态网页的内容
#参考了http://blog.csdn.net/qq_34773726/article/details/72546163?locationNum=12&fps=1

library(stringr)
library(xml2)
library(rvest)

# Accumulators filled by the scraping loop: `alldf` collects one row per job
# posting, `keywords_final` collects every keyword tag as a flat vector.
alldf <- data.frame()
keywords_final <- vector()

# Base search URL; the query parameter is the URL-encoded search phrase and the
# result-page number is appended after "page=".
site <- "https://www.zhipin.com/c101020100/h_101020100/?query=%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90&page="
page <- 1

# Pull one text field out of HTML nodes: take the first match of `pattern` in
# each node's markup, then cut away the delimiter characters the pattern had to
# include. `start`/`end` are passed to str_sub(); a negative `end` counts from
# the right-hand side of the match.
extract_field <- function(nodes, pattern, start, end) {
  str_sub(str_extract(nodes, pattern), start = start, end = end)
}

# Scrape result pages 1-20, appending every posting to `alldf` and every
# keyword tag to `keywords_final`.
for (page in 1:20) {
  # Build the URL from the unchanged base on every iteration. Overwriting
  # `site` here would carry the previous digits along ("page=1", "page=12",
  # "page=123", ...) and request the wrong pages.
  page_url <- paste0(site, page)
  webpage <- read_html(page_url)

  # Job title: link text up to the following <span.
  job_links <- html_nodes(webpage, ".info-primary .name") %>% html_nodes("a")
  job <- extract_field(job_links, ">.+<span", 2, -6)

  # Company name: text of the company link.
  company_links <- html_nodes(webpage, ".info-company .company-text .name") %>%
    html_node("a")
  company <- extract_field(company_links, ">.+</a", 2, -4)

  # Salary: content of the .red element inside the job name.
  salary_nodes <- html_nodes(webpage, ".info-primary .name .red")
  salary <- extract_field(salary_nodes, ">.+</span", 2, -7)

  # The first <p> of each posting holds three fields separated by <em> tags:
  # location, experience, education.
  job_detail <- html_nodes(webpage, ".job-primary .info-primary") %>%
    html_node("p")

  # Location: text between the opening <p> and the first <em.
  location <- extract_field(job_detail, "p>.+?<em", 3, -4)

  # Work experience: text between the first </em> and the next <em.
  experience <- extract_field(job_detail, "/em>.+?<em", 5, -4)

  # Education: text after the last </em>. The lookahead
  # (?=((?!/em>).)*$) only lets "/em>" match where no further "/em>" follows.
  education <- extract_field(
    job_detail, "/em>(?=((?!/em>).)*$).+?</p", 5, -4
  )

  # The company <p> uses the same three-field layout:
  # company type, economy state, company size.
  company_detail <- html_nodes(webpage, ".info-company .company-text") %>%
    html_node("p")

  # Company type: first field.
  company_type <- extract_field(company_detail, "p>.+?<em", 3, -4)

  # Economy state: middle field.
  economy_state <- extract_field(company_detail, "/em>.+?<em", 5, -4)

  # Company size: last field (same last-"/em>" lookahead as for education).
  size <- extract_field(
    company_detail, "/em>(?=((?!/em>).)*$).+?</p", 5, -4
  )

  # Publication date: text following the "发布于" prefix.
  date_nodes <- html_nodes(webpage, ".job-time .time")
  date <- extract_field(date_nodes, "发布于.+?<", 4, -2)

  # Keyword tags: one <span> per tag inside .job-tags.
  keyword_nodes <- html_nodes(webpage, ".job-tags") %>% html_nodes("span")
  keywords <- extract_field(keyword_nodes, ">.+<", 2, -2)

  # One row per posting. NOTE(review): cbind() recycles shorter vectors, so a
  # posting with a missing field would misalign columns rather than fail.
  newdf <- cbind(
    job, company, salary, location, experience, education,
    company_type, economy_state, size, date
  )
  alldf <- rbind(alldf, newdf)
  keywords_final <- c(keywords_final, keywords)
}

# Save the scraped postings and the collected keyword tags as CSV files
# (relative paths, so they land in the current working directory).
write.csv(x = alldf, file = "alldf.csv")
write.csv(x = keywords_final, file = "keywords.csv")

# Frequency table of the salary strings.
table(alldf[["salary"]])

#PS:html_text()可以直接获得某节点的所有文本，但是因为我们这里很多节点包含了文本和其他节点，因此用正则表达式更加容易提取

分享好友

分享这个小栈给你的朋友们，一起进步吧。

R语言

创建时间：2020-06-15 11:46:51

R是用于统计分析、绘图的语言和操作环境。R是属于GNU系统的一个自由、免费、源代码开放的软件，它是一个用于统计计算和统计制图的工具。

展开

订阅须知

• 所有用户可根据关注领域订阅专区或所有专区

• 付费订阅：虚拟交易，一经交易不退款；若特殊情况，可3日内客服咨询

• 专区发布评论属默认订阅所评论专区（除付费小栈外）

技术专家

查看更多

小雨滴
专家