Няма описание

taopp.py 4.8KB

    #!/usr/bin/env python # coding: utf-8 import urllib2 import urllib import requests from bs4 import BeautifulSoup from lxml import etree import MySQLdb import time import random import sys reload(sys) sys.setdefaultencoding('utf8') """猫眼不需要http请求的头部信息,添加上反而会出错""" class TaoPP(): # 获取当前时间 def getCurrentTime(self): return time.strftime('[%Y-%m-%d %H:%M:%S]', time.localtime(time.time())) def __init__(self): self.time = time try: #self.db = MySQLdb.connect('127.0.0.1', 'flyer_user', 'Tu4a0X9hOPKz6jS!e', 'flyer_db',22809) self.db = MySQLdb.connect('rm-2zeqhsdou4zxl4914.mysql.rds.aliyuncs.com', 'flyer_user', 'GLZcMigd4VnsrmzNrOC1HGjv2eQzBqg3', 'flyer_db', 3306) self.cur = self.db.cursor() except MySQLdb.Error, e: print self.getCurrentTime(), "连接数据库错误,原因%d: %s" % (e.args[0], e.args[1]) # 插入数据 def insertData(self, my_dict): try: self.db.set_character_set('utf8') cols = ', '.join(my_dict.keys()) print cols values = '","'.join(my_dict.values()) print values sql = "INSERT INTO taopp (%s) VALUES (%s)" % (cols, '"' + values + '"') try: result = self.cur.execute(sql) insert_id = self.db.insert_id() self.db.commit() # 判断是否执行成功 if result: return insert_id else: return 0 except MySQLdb.Error, e: # 发生错误时回滚 self.db.rollback() # 主键唯一,无法插入 if "key 'PRIMARY'" in e.args[1]: print self.getCurrentTime(), "数据已存在,未插入数据" else: print self.getCurrentTime(), "插入数据失败,原因 %d: %s" % (e.args[0], e.args[1]) except MySQLdb.Error, e: print self.getCurrentTime(), "数据库错误,原因%d: %s" % (e.args[0], e.args[1]) # self.db.close def dealData(self, my_dict): """先从数据库中查询这个影片名的记录,如果没有直接插入; 若是数据库中已经存在数据,那么什么都不做""" self.db.set_character_set('utf8') filmTitile = my_dict.get('film_title') result = self.findFilmByfilmTitle(filmTitile) if len(result) == 0: self.insertData(my_dict) return else: pass def findFilmByfilmTitle(self, filmTitile): """通过影片名称去数据库中查询""" self.db.set_character_set('utf8') sql = "select * from taopp where film_title='%s'" % (filmTitile) self.cur.execute(sql) results = self.cur.fetchall() self.db.commit() return results def getHtml(self,url): """获取html""" req = urllib2.Request(url) html = urllib2.urlopen(req).read().decode('utf-8') # print html return html def get_detail_url(self, targetUrl): """从概览页面中去获取剧透影片的详情页面""" html = self.getHtml(targetUrl) detailPath = etree.HTML(html) detail_urls = detailPath.xpath(r'/html/body/div[4]/div[1]/div[2]/div[1]/div/a[1]/@href') others_urls = detailPath.xpath(r'/html/body/div[4]/div[1]/div[2]/div[2]/div/a[1]/@href') for url in others_urls: # print url self.getContents(url) for url in detail_urls: self.getContents(url) # print url def getContents(self, url): tmp1 = [] tmp2 = [] html = self.getHtml(url) detailPath = etree.HTML(html) film_title = detailPath.xpath('/html/body/div[3]/div[4]/div/h3/text()') for name in film_title: # print name tmp1.append('film_title') tmp2.append(name) if len(name) != 0: break pic_url = detailPath.xpath('/html/body/div[3]/div[4]/div/div[1]/img/@src') for pu in pic_url: # print pu tmp1.append('pic_url') tmp2.append(pu) release_time = detailPath.xpath('/html/body/div[3]/div[4]/div/div[2]/text()') for rt in release_time: # print rt[5:] tmp1.append('release_time') tmp2.append(rt[5:15]) result_dict = dict(zip(tmp1, tmp2)) # self.time.sleep(1) self.dealData(result_dict) print '**' * 20 for k, v in result_dict.items(): print k, v if __name__ == '__main__': taopp = TaoPP() taopp.get_detail_url('https://dianying.taobao.com/showList.htm?spm=a1z21.3046609.header.4.bpD8SA&n_s=new')