#!/usr/bin/env python
# coding: utf-8
import urllib2
import urllib
import requests
from bs4 import BeautifulSoup
from lxml import etree
import MySQLdb
import time
import random
import sys
reload(sys)
sys.setdefaultencoding('utf8')
"""猫眼不需要http请求的头部信息,添加上反而会出错"""
class TaoPP():
# 获取当前时间
def getCurrentTime(self):
return time.strftime('[%Y-%m-%d %H:%M:%S]', time.localtime(time.time()))
def __init__(self):
self.time = time
try:
#self.db = MySQLdb.connect('127.0.0.1', 'flyer_user', 'Tu4a0X9hOPKz6jS!e', 'flyer_db',22809)
self.db = MySQLdb.connect('rm-2zeqhsdou4zxl4914.mysql.rds.aliyuncs.com', 'flyer_user', 'GLZcMigd4VnsrmzNrOC1HGjv2eQzBqg3', 'flyer_db', 3306)
self.cur = self.db.cursor()
except MySQLdb.Error, e:
print self.getCurrentTime(), "连接数据库错误,原因%d: %s" % (e.args[0], e.args[1])
# 插入数据
def insertData(self, my_dict):
try:
self.db.set_character_set('utf8')
cols = ', '.join(my_dict.keys())
print cols
values = '","'.join(my_dict.values())
print values
sql = "INSERT INTO taopp (%s) VALUES (%s)" % (cols, '"' + values + '"')
try:
result = self.cur.execute(sql)
insert_id = self.db.insert_id()
self.db.commit()
# 判断是否执行成功
if result:
return insert_id
else:
return 0
except MySQLdb.Error, e:
# 发生错误时回滚
self.db.rollback()
# 主键唯一,无法插入
if "key 'PRIMARY'" in e.args[1]:
print self.getCurrentTime(), "数据已存在,未插入数据"
else:
print self.getCurrentTime(), "插入数据失败,原因 %d: %s" % (e.args[0], e.args[1])
except MySQLdb.Error, e:
print self.getCurrentTime(), "数据库错误,原因%d: %s" % (e.args[0], e.args[1])
# self.db.close
def dealData(self, my_dict):
"""先从数据库中查询这个影片名的记录,如果没有直接插入;
若是数据库中已经存在数据,那么什么都不做"""
self.db.set_character_set('utf8')
filmTitile = my_dict.get('film_title')
result = self.findFilmByfilmTitle(filmTitile)
if len(result) == 0:
self.insertData(my_dict)
return
else:
pass
def findFilmByfilmTitle(self, filmTitile):
"""通过影片名称去数据库中查询"""
self.db.set_character_set('utf8')
sql = "select * from taopp where film_title='%s'" % (filmTitile)
self.cur.execute(sql)
results = self.cur.fetchall()
self.db.commit()
return results
def getHtml(self,url):
"""获取html"""
req = urllib2.Request(url)
html = urllib2.urlopen(req).read().decode('utf-8')
# print html
return html
def get_detail_url(self, targetUrl):
"""从概览页面中去获取剧透影片的详情页面"""
html = self.getHtml(targetUrl)
detailPath = etree.HTML(html)
detail_urls = detailPath.xpath(r'/html/body/div[4]/div[1]/div[2]/div[1]/div/a[1]/@href')
others_urls = detailPath.xpath(r'/html/body/div[4]/div[1]/div[2]/div[2]/div/a[1]/@href')
for url in others_urls:
# print url
self.getContents(url)
for url in detail_urls:
self.getContents(url)
# print url
def getContents(self, url):
tmp1 = []
tmp2 = []
html = self.getHtml(url)
detailPath = etree.HTML(html)
film_title = detailPath.xpath('/html/body/div[3]/div[4]/div/h3/text()')
for name in film_title:
# print name
tmp1.append('film_title')
tmp2.append(name)
if len(name) != 0:
break
pic_url = detailPath.xpath('/html/body/div[3]/div[4]/div/div[1]/img/@src')
for pu in pic_url:
# print pu
tmp1.append('pic_url')
tmp2.append(pu)
release_time = detailPath.xpath('/html/body/div[3]/div[4]/div/div[2]/text()')
for rt in release_time:
# print rt[5:]
tmp1.append('release_time')
tmp2.append(rt[5:15])
result_dict = dict(zip(tmp1, tmp2))
# self.time.sleep(1)
self.dealData(result_dict)
print '**' * 20
for k, v in result_dict.items():
print k, v
if __name__ == '__main__':
    # Entry point: crawl the "new films" overview page on taobao movies.
    spider = TaoPP()
    spider.get_detail_url('https://dianying.taobao.com/showList.htm?spm=a1z21.3046609.header.4.bpD8SA&n_s=new')