Introduction
福哥 has updated the Spider object to crawl page content, analyze page data, and write the results into the database. With these features in place, the basic functionality of the web spider is essentially complete! After this we can start building the search engine's front-end UI, which is much simpler than the web spider itself!
However, the web spider is the foundation of the search engine: only after enough data has been collected can the search engine return the results users want to see when they run a query. So make sure you study today's lesson carefully!
Adding a Field
There was an oversight when the data tables were designed earlier: 福哥 did not create a key in the webpages table linking it to the websites table, so there was no way to retrieve all the pages belonging to a given domain from webpages.
New Field
We need to add a key field websiteId to the webpages table that references the primary key of the websites table.
SQL
The SQL statement is as follows:
mysql> alter table webpages add column websiteId int not null after webpageId, add key websiteId (websiteId);
Query OK, 0 rows affected (0.18 sec)
Records: 0  Duplicates: 0  Warnings: 0

mysql>
Clearing the Data
Because 福哥 had not added the websiteId key to the webpages table earlier, the webpages rows we collected in the previous lesson are all missing a websiteId value. To fix this we need to empty the webpages table.
There are two ways to clear the data: one is to delete all rows with a DELETE statement, the other is to reset the table with a TRUNCATE statement. The difference is that TRUNCATE not only removes all the rows but also resets everything else, including the auto-increment counter, back to zero.
SQL
The DELETE syntax is as follows:
mysql> delete from webpages;
Query OK, 388 rows affected (0.00 sec)

mysql>
The TRUNCATE syntax is as follows:
mysql> truncate table webpages;
Query OK, 0 rows affected (0.01 sec)

mysql>
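If you prefer to reset the table from Python instead of the mysql client, here is a minimal sketch using pymysql; the connection settings (host, user, password, database name) below are placeholders, not the values used in this series:

import pymysql

# Placeholder connection settings: replace with your own MySQL credentials and database name.
conn = pymysql.connect(host="localhost", user="root", password="******", database="mydb")
try:
    with conn.cursor() as cursor:
        # TRUNCATE removes all rows and resets the auto-increment counter.
        cursor.execute("TRUNCATE TABLE webpages")
    conn.commit()
finally:
    conn.close()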
The Spider Object
Methods
getPendingDomains
def getPendingDomains(self, nums):
    try:
        self.mysqlCS.execute(
            "SELECT * FROM websites ORDER BY nextFetchDT ASC, lastFetchDT DESC LIMIT %s",
            nums)
        rows = self.mysqlCS.fetchall()

        return rows
    except Exception as e:
        print("Fail to getPendingDomains")
        print(e)
        self.mysqlCN.rollback()

        return None
getPendingPages
def getPendingPages(self, domainName, nums):
    try:
        self.mysqlCS.execute("SELECT * FROM websites WHERE domainName = %s", [domainName])
        row = self.mysqlCS.fetchone()
        if row is not None:
            websiteId = row[0]
            self.mysqlCS.execute(
                "SELECT * FROM webpages WHERE websiteId = %s ORDER BY lastFetchDT DESC LIMIT %s",
                [websiteId, nums])
            rows = self.mysqlCS.fetchall()

            return rows
    except Exception as e:
        print("Fail to getPendingPages")
        print(e)
        self.mysqlCN.rollback()

        return None
analyzePage
def analyzePage(self, url):
    title = self.chrome.title
    keywords = ""
    description = ""
    charset = ""
    pageSource = self.chrome.page_source

    stRE = re.compile("<\/?[a-z0-9]+[^\>]*>", re.M | re.I)
    scriptRE = re.compile("<(script).*?>[\s\S]*?<\/\1>", re.M | re.I)
    bcRE = re.compile("[\s\t\r\n]+", re.M | re.I)

    pageText = self.chrome.page_source.encode("gbk", "ignore")
    pageText = re.sub(stRE, " ", pageText)
    pageText = re.sub(scriptRE, " ", pageText)
    pageText = re.sub(bcRE, " ", pageText)
    pageText = pageText.decode("gbk")

    metas = self.chrome.find_elements_by_tag_name("meta")
    for meta in metas:
        myName = meta.get_attribute("name")
        myContent = meta.get_attribute("content")
        myHttpEquiv = meta.get_attribute("http-equiv")
        if myName == "keywords":
            keywords = myContent
        elif myName == "description":
            description = myContent
        elif myHttpEquiv is not None and myHttpEquiv.lower() == "content-type":
            myCharset = myContent
            csRE = re.compile("charset\=([^\;]+)", re.M | re.I)
            mats = csRE.search(myCharset)
            if mats:
                charset = mats.group(1).lower()

    return {
        "url": url,
        "title": title,
        "keywords": keywords,
        "description": description,
        "charset": charset,
        "pageText": pageText
    }
findAllATags
def findAllATags(self, websiteId, domainName):
    links = self.chrome.find_elements_by_tag_name("a")
    for link in links:
        try:
            myHref = link.get_attribute("href").encode("gbk")
            myWebPageType = self.isWebPage(myHref, domainName)
            if myWebPageType == 1:
                ret = self.saveWebPage(websiteId, myHref)
            elif myWebPageType == 2:
                lnkRE = re.compile("^(http|https)\:\/\/([^\/]+)", re.M | re.I)
                mats = lnkRE.search(myHref)
                if mats:
                    ret = self.saveDomainPage(mats.group(2).lower())
        except Exception as e:
            doNothing = e
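The isWebPage and saveDomainPage helpers were written in an earlier lesson and are not repeated in this article. Judging only from how findAllATags calls it, isWebPage is assumed to return 1 for a link under the current domain and 2 for a link pointing to another domain; the sketch below is a guess at that behavior under those assumptions, not the actual code from the earlier lesson:

import re

def isWebPage(href, domainName):
    # Hypothetical reconstruction: classify an absolute http(s) link by its domain.
    mats = re.search("^(http|https)\:\/\/([^\/]+)", href, re.I)
    if not mats:
        return 0  # not an absolute http(s) link
    linkDomain = mats.group(2).lower()
    if linkDomain == domainName:
        return 1  # a page under the current domain
    return 2  # a page on another domain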
fetchDomainURL
def fetchDomainURL(self, websiteId, domainName):
    # open page of domain
    try:
        url = "http://" + domainName
        print("打开网址:" + url)
        self.chrome.get(url)
    except Exception as e:
        print("Failed to fetch domain URL: " + domainName)
        print(e)

        return False

    # find all A tags
    self.findAllATags(websiteId, domainName)

    return True
fetchWebURL
def fetchWebURL(self, websiteId, domainName, url):
    # open page of url
    try:
        print("打开网址:" + url)
        self.chrome.get(url)
    except Exception as e:
        print("Failed to fetch web URL: " + url)
        print(e)

        return False

    # find all A tags
    self.findAllATags(websiteId, domainName)

    return True
fetchDomainPageURL
def fetchDomainPageURL(self, websiteId, domainName, webpageId, url):
    # open page of url
    try:
        print("打开网页: " + url)
        self.chrome.get(url)
    except Exception as e:
        print("Failed to fetch domain page URL: " + url)
        print(e)

        return False

    # analyze page
    print("分析网页: " + url)
    args = self.analyzePage(url)

    # save web page full data
    self.updateWebPage(webpageId, args)

    # find all A tags
    self.findAllATags(websiteId, domainName)

    return True
saveWebPage
def saveWebPage(self, websiteId, url):
    # try to insert url into webpages
    try:
        self.mysqlCS.execute("SELECT * FROM webpages WHERE url = %s", [url])
        row = self.mysqlCS.fetchone()
        if row is None:
            self.mysqlCS.execute(
                "INSERT INTO webpages (websiteId, url, statusCode, charset, createDT) VALUES (%s, %s, %s, %s, now())",
                [websiteId, url, 200, 1])
            self.mysqlCN.commit()
            print("保存网页: " + url)

            return self.mysqlCS.lastrowid
        else:
            return row[0]
    except Exception as e:
        print("Fail to saveWebPage")
        print(e)

        return None
updateWebPage
def updateWebPage(self, webpageId, args):
    try:
        self.mysqlCS.execute(
            "UPDATE webpages SET title = %s, keywords = %s, description = %s, lastFetchDT = now() WHERE webpageId = %s",
            [args["title"], args["keywords"].encode("utf-8"), args["description"].encode("utf-8"), webpageId])
        self.mysqlCN.commit()
        print("更新网页: " + args["url"])

        return webpageId
    except Exception as e:
        print("Failed to updateWebPage")
        print(e)

        return None
Main Program
The main program code has also been updated: in addition to the domain crawling it did before, it now also crawls the pages under each domain.
from lib.Spider import *

mySpider = Spider()
mySpider.open()

# fetch up to 10 pending domains; for each one crawl its home page,
# then crawl up to 100 pending pages under that domain
domains = mySpider.getPendingDomains(10)
if domains is not None:
    for domain in domains:
        # domain[0] = websiteId, domain[1] = domainName
        mySpider.fetchDomainURL(domain[0], domain[1])
        pages = mySpider.getPendingPages(domain[1], 100)
        if pages is not None:
            for page in pages:
                # page[0] = webpageId, page[2] = url
                mySpider.fetchDomainPageURL(domain[0], domain[1], page[0], page[2])

mySpider.close()
Explanation
getPendingDomains
Gets the list of domains that have gone the longest without being crawled, ordered by their last fetch time; these are the domains this run needs to crawl
Queries the specified number of domain records from the websites table
getPendingPages
Gets the list of URLs that have gone the longest without being crawled, ordered by their last fetch time; these are the URLs under the current domain that this run needs to crawl
Looks up the matching record in the websites table by domain name
Uses the value of the websites.websiteId field to query the specified number of webpage records belonging to that domain from the webpages table
analyzePage
Uses a regular expression to extract the charset information from the page's meta tags
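As a quick illustration of what that regular expression does, here is a minimal standalone sketch; the sample content string below is made up for demonstration and is not taken from a real page:

import re

# The same charset pattern used in analyzePage.
csRE = re.compile("charset\=([^\;]+)", re.M | re.I)

# Hypothetical value of a meta http-equiv="content-type" content attribute.
sample = "text/html; charset=GB2312"

mats = csRE.search(sample)
if mats:
    print(mats.group(1).lower())  # prints: gb2312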
findAllATags
Added the websiteId parameter, which is passed on to the saveWebPage method
fetchDomainURL
Added the websiteId parameter, which is passed on to the findAllATags method
fetchWebURL
Added the websiteId parameter, which is passed on to the findAllATags method
fetchDomainPageURL
Opens the page at the given url
Analyzes the page information, including the title, keywords, description and so on
Updates the page information in the webpages table (the plain page text extracted by analyzePage is not stored yet; that comes in the next lesson)
Scans the hyperlinks on the page and extracts the URLs of child pages
saveWebPage
Added the websiteId parameter, whose value is written into the webpages table along with the rest of the record
updateWebPage
Updates the relevant fields of the webpages table according to webpageId
Summary
Our web spider is now more than 80% complete. What remains is to extract the text of each page and write it into the database, and then to sync the page data from the database into the ElasticSearch search engine; once that is done, the collection work will be essentially finished!
In the next lesson, 福哥 will walk everyone through writing the final part of the web spider's code. Be sure to study hard!
P.S.
Articles published on the WeChat public account cannot be edited afterwards, but 福哥 occasionally fixes problems such as unclear descriptions, incorrect examples, or poor structure. The latest version of each article can only be seen on the website, so please keep that in mind.