#xpath+正则表达式提取静态网页的内容
#参考了http://blog.csdn.net/qq_34773726/article/details/72546163?locationNum=12&fps=1
library(stringr)
library(xml2)
library(rvest)
# ---- Scrape job listings from zhipin.com search result pages ----
# Downloads result pages 1..n_pages of the search URL below, pulls ten fields
# per listing out of the serialized HTML with regular expressions, and writes
# the listings to "alldf.csv" and all keyword tags to "keywords.csv".

# Base search URL; the page number is appended for every request. It must stay
# unchanged across iterations, otherwise page numbers pile up in the query
# string ("page=1", "page=12", "page=123", ...).
site <- "https://www.zhipin.com/c101020100/h_101020100/?query=%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90&page="
n_pages <- 20L

# Extract the first match of `pattern` from each element of `markup` and trim
# the delimiter characters that the pattern needs for anchoring.
#   markup  character vector of serialized HTML, one element per node
#   pattern regular expression handed to str_extract()
#   start   first character to keep (str_sub() position)
#   end     last character to keep (negative values count from the end)
# Returns a character vector as long as `markup`; NA where nothing matched.
extract_field <- function(markup, pattern, start, end) {
  str_sub(str_extract(markup, pattern), start = start, end = end)
}

# One slot per page, combined once after the loop instead of growing the
# results with rbind()/c() on every iteration.
page_tables <- vector("list", n_pages)
page_keywords <- vector("list", n_pages)

for (page in seq_len(n_pages)) {
  page_url <- paste0(site, page)
  webpage <- read_html(page_url)

  # Node sets are serialized with as.character() so that the regular
  # expressions explicitly run on the HTML markup of each node.

  # Job title: text between the opening <a ...> tag and the following <span.
  job_html <- as.character(
    html_nodes(webpage, ".info-primary .name") %>% html_nodes("a")
  )
  job <- extract_field(job_html, ">.+<span", 2, -6)

  # Company name: text inside the company link.
  company_html <- as.character(
    html_nodes(webpage, ".info-company .company-text .name") %>% html_node("a")
  )
  company <- extract_field(company_html, ">.+</a", 2, -4)

  # Salary: content of the ".red" span inside the job name.
  salary_html <- as.character(html_nodes(webpage, ".info-primary .name .red"))
  salary <- extract_field(salary_html, ">.+</span", 2, -7)

  # The first <p> of the job info holds three segments separated by <em> tags:
  # location, experience and education.
  job_info_html <- as.character(
    html_nodes(webpage, ".job-primary .info-primary") %>% html_node("p")
  )
  # Location: from the opening <p> up to the first <em.
  location <- extract_field(job_info_html, "p>.+?<em", 3, -4)
  # Experience: between the first </em> and the next <em.
  experience <- extract_field(job_info_html, "/em>.+?<em", 5, -4)
  # Education: after the last </em>. The idiom c(?=((?!c).)*$) matches the
  # final occurrence of c, here "/em>".
  education <- extract_field(
    job_info_html, "/em>(?=((?!/em>).)*$).+?</p", 5, -4
  )

  # The first <p> of the company info has the same three-segment layout:
  # company type, economy state (variable name from the original script) and
  # company size.
  company_info_html <- as.character(
    html_nodes(webpage, ".info-company .company-text") %>% html_node("p")
  )
  company_type <- extract_field(company_info_html, "p>.+?<em", 3, -4)
  economy_state <- extract_field(company_info_html, "/em>.+?<em", 5, -4)
  size <- extract_field(
    company_info_html, "/em>(?=((?!/em>).)*$).+?</p", 5, -4
  )

  # Publication date: text after the three-character prefix of the pattern
  # (which reads "published on") up to the next tag.
  date_html <- as.character(html_nodes(webpage, ".job-time .time"))
  date <- extract_field(date_html, "发布于.+?<", 4, -2)

  # Keyword tags: one entry per <span> inside ".job-tags". They are not aligned
  # with the listings, so they are collected separately.
  keyword_html <- as.character(
    html_nodes(webpage, ".job-tags") %>% html_nodes("span")
  )
  keywords <- extract_field(keyword_html, ">.+<", 2, -2)

  # cbind() names the columns after the variables passed in.
  page_tables[[page]] <- cbind(
    job, company, salary, location, experience, education,
    company_type, economy_state, size, date
  )
  page_keywords[[page]] <- keywords
}

# Leading with an empty data frame makes rbind() return a data frame, exactly
# as the page-by-page rbind(alldf, newdf) did.
alldf <- do.call(rbind, c(list(data.frame()), page_tables))
keywords_final <- unlist(page_keywords)

write.csv(alldf, file = "alldf.csv")
write.csv(keywords_final, file = "keywords.csv")

# Frequency of each salary string across all scraped listings.
table(alldf$salary)
#PS:html_text()可以直接获得某节点的所有文本,但是因为我们这里很多节点包含了文本和其他节点,因此用正则表达式更加容易提取
# [R爬虫]看看数据分析师的薪资
# 分享好友
# 分享这个小栈给你的朋友们,一起进步吧。
# 订阅须知
# • 所有用户可根据关注领域订阅专区或所有专区
# • 付费订阅:虚拟交易,一经交易不退款;若特殊情况,可3日内客服咨询
# • 专区发布评论属默认订阅所评论专区(除付费小栈外)
# 技术专家
# 查看更多- 小雨滴专家