Date: 2023-01-19 22:44:10
Scrapes and stores the page source of nsfwpics (current URL: http://picxxxx.top/) with Python, saving the image links and their source pages into MySQL.
I haven't started downloading the images themselves yet; so far I've stored the source of 334×6 pages, plus the image info found on those pages.
Why are some of the older images no longer accessible? Has anyone here scraped them before? I'm looking for the images updated after 2020-05-18.
Environment: MySQL 8.0.23 Community Edition, Python 3.8.5.
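The third-party packages the script imports can be installed with: pip install requests beautifulsoup4 lxml pymysql (lxml is needed because BeautifulSoup is called with the 'lxml' parser).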
Database dump exported with mysqldump: https://pan.baidu.com/s/1m0z1V6t-bRk5nXHj_Wyydw (extraction code: FULI)
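To load the dump locally (assuming it is a plain .sql file, here called nsfwpic.sql), create the nsfwpic database first if the dump does not do so itself, then feed the file to the mysql client: mysql -u user -p nsfwpic < nsfwpic.sql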
Source code (messy, adjust as needed):
import requests
from requests.adapters import HTTPAdapter
from bs4 import BeautifulSoup
import time
import pymysql
import logging
import re
import os
import traceback
root_url = 'http://picxxxx.top/'
start_page, end_page = 1, 10
url_p = re.compile(r'^http://picxxxx\.top/.+\.html$', flags=re.I)
img_se_path = r'D:\spider-picxxxx\pics'
video_se_path = r'D:\spider-picxxxx\videos'
# Index pages:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36 Edg/88.0.705.63'
}
def get_logger():
    try:
        formatter = logging.Formatter(
            '%(lineno)d : %(asctime)s : %(levelname)s : %(funcName)s : %(message)s')
        fileHandler = logging.FileHandler(
            r'D:\spider-picxxxx\运行记录{}.txt'.format(time.strftime('-%Y-%m-%d')), mode='w', encoding='utf-8')
        fileHandler.setFormatter(formatter)
        log = logging.getLogger('logger')
        log.setLevel(logging.DEBUG)
        log.addHandler(fileHandler)
        return log
    except Exception:
        print('Failed to initialize logger')
        return None
start_time = time.process_time()
log = get_logger()
msg = 'Start time: {}'.format(time.strftime('%Y-%m-%d %H:%M:%S'))
print(msg)
log.info(msg)
conn = pymysql.connect(host='127.0.0.1',
                       user='user',
                       passwd='passwd',
                       db='nsfwpic',
                       charset='utf8')
cur = conn.cursor()
# Retry count. Counting the initial attempt, at most 4 connection attempts are made on errors.
session = requests.Session()
session.mount('http://', HTTPAdapter(max_retries=3))
session.mount('https://', HTTPAdapter(max_retries=3))
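# Note: a plain max_retries=3 only retries failed connections. A urllib3 Retry
# object (a possible refinement, not in the original) would also add backoff
# and retry on 5xx responses:
# from urllib3.util.retry import Retry
# retry = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
# session.mount('http://', HTTPAdapter(max_retries=retry))
# session.mount('https://', HTTPAdapter(max_retries=retry))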
def make_soup(url):
    resp = session.get(url, timeout=60, headers=headers)
    resp.encoding = resp.apparent_encoding  # set the encoding before reading .text, not after
    bsObj = BeautifulSoup(resp.text, 'lxml')
    return bsObj
def get_last_page(root_url):
    bsObj = make_soup(root_url)
    last_page = bsObj.find('a', {'href': re.compile(
        r'^http://picxxxx\.top/page/\d{3}/$', flags=re.I)})  # assumes a 3-digit page count
    start, end = re.search(r'\d{3}', last_page['href']).span()
    last_page = int(last_page['href'][start:end])
    msg = 'Current total page count: {}'.format(last_page)
    log.info(msg)
    print(msg)
    return last_page
def get_new_pics_pages(root_url):
    last_page = get_last_page(root_url)
    index_pages = []
    msg = 'This run covers page {} to page {}'.format(start_page, end_page)
    log.info(msg)
    print(msg)
    for i in range(start_page, end_page+1):
        index_pages.append(r'http://picxxxx.top/page/'+str(i)+'/')
    for index_page in index_pages:
        bsObj = make_soup(index_page)
        url_tags = bsObj.find_all('a', {'href': url_p})
        urls = []
        for url_tag in url_tags:
            url = url_tag['href']
            urls.append(url)
        for url in urls:
            cur.execute(
                '''SELECT no FROM picxxxx_pages WHERE url="%s";''' % (url))
            existence = cur.fetchall()
            if len(existence) == 0:
                cur.execute('''INSERT INTO picxxxx_pages (url) VALUES ("%s");''' % (
                    url))
                conn.commit()
                log.info('Found new page: {}'.format(url))
            else:
                log.info('Existing page: {}'.format(url))
        time.sleep(1)
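# A UNIQUE index on url plus INSERT IGNORE could replace the SELECT-then-INSERT
# above (a possible refinement, not in the original):
#   ALTER TABLE picxxxx_pages ADD UNIQUE (url);
#   cur.execute('INSERT IGNORE INTO picxxxx_pages (url) VALUES (%s);', (url,))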
def se_new_pics_pages_content(root_url):
    cur.execute('SELECT url FROM picxxxx_pages WHERE content is NULL')
    new_pics_pages = cur.fetchall()
    if len(new_pics_pages) == 0:
        return
    for url in new_pics_pages:
        url = url[0]
        bsObj = make_soup(url)
        content = str(bsObj)
        # escape backslashes and quotes by hand, since the SQL below is built with %-formatting
        content = content.replace('\\', '\\\\')
        content = content.replace('"', '\\"')
        content = content.replace("'", "\\'")
        if content is not None:
            cur.execute('''UPDATE picxxxx_pages SET content="%s" WHERE url="%s";''' % (
                content, url))
            conn.commit()
            log.info('Fetched page: {}'.format(url))
        else:
            cur.execute('''INSERT INTO picxxxx_pages (url,content) VALUES ("%s","unknown");''' % (
                url))
            conn.commit()
            log.info('Failed to fetch page: {}'.format(url))
        time.sleep(1)
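# The hand-rolled escaping above is only needed because the SQL is assembled
# with %-formatting. pymysql can quote values itself via parameterized queries,
# which also closes the SQL-injection hole, e.g.:
# cur.execute('UPDATE picxxxx_pages SET content=%s WHERE url=%s;', (content, url))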
def update_img_urls():
    cur.execute(
        'SELECT url,content FROM picxxxx_pages WHERE checked=0 AND content!="unknown" AND content IS NOT NULL;')
    to_do_list = cur.fetchall()
    for url, content in to_do_list:
        img_urls = []
        # undo the escaping applied before storage
        content = content.replace('\\\\', '\\')
        content = content.replace('\\"', '"')
        content = content.replace("\\'", "'")
        bsObj = BeautifulSoup(content, 'lxml')
        img_tags = bsObj.find_all('img')
        for img_tag in img_tags:
            for attr in ['src', 'data-src']:
                img_url = img_tag.get(attr)
                if img_url is not None and img_url not in img_urls:
                    img_urls.append(img_url)
        img_urls = sorted(set(img_urls))
        for img_url in img_urls:
            cur.execute(
                'SELECT no FROM picxxxx_pics WHERE img_url="%s";' % (img_url))
            existence = cur.fetchall()
            if len(existence) != 0:
                continue
            else:
                cur.execute(
                    '''INSERT INTO picxxxx_pics (url,img_url) VALUES ("%s","%s");''' % (url, img_url))
                conn.commit()
                msg = 'Image recorded: {}'.format(img_url)
                log.info(msg)
        cur.execute(
            'UPDATE picxxxx_pages SET checked=1 WHERE url="%s";' % (url))
        conn.commit()
def download_img():
    cur.execute('''SELECT url,img_url FROM picxxxx_pics WHERE downloaded=0;''')
    items = cur.fetchall()
    total_num = len(items)
    for num, item in enumerate(items, start=1):
        url, img_url = item
        headers['Referer'] = url  # send the source page as Referer to get past hotlink protection
        img = session.get(img_url, timeout=30, headers=headers)
        img = img.content
        img_name = img_url.split('#')[0]
        img_name = img_name.split('/')[-1]
        img_name = time.strftime('%Y-%m-%d-%H%M%S')+img_name
        place = os.path.join(img_se_path, img_name)
        with open(place, 'wb') as f:
            f.write(img)
        msg = 'Image downloaded {}/{}, {:>100}'.format(num, total_num, img_url)
        print('\r'+msg, end='')  # overwrite the previous progress line
        log.info(msg)
        # escape the Windows path before interpolating it into the SQL
        place = place.replace('\\', '\\\\')
        place = place.replace('"', '\\"')
        place = place.replace("'", "\\'")
        cur.execute(
            'UPDATE picxxxx_pics SET place="%s",downloaded=1 where img_url="%s";' % (place, img_url))
        conn.commit()
        time.sleep(0.5)
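# For large files, requests can stream the body to disk instead of holding it
# in memory all at once (a possible refinement, not in the original):
# with session.get(img_url, timeout=30, headers=headers, stream=True) as r:
#     with open(place, 'wb') as f:
#         for chunk in r.iter_content(chunk_size=8192):
#             f.write(chunk)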
# Example img_url: https://search.pstatic.net/common?src=https://i.imgur.com/DbxbIr4.jpg#vwid=853&vhei=1280
#
# Video handling (update_video_urls / download_video) is not written yet.
#
def main():
    # get_new_pics_pages(root_url)
    # se_new_pics_pages_content(root_url)
    update_img_urls()
    # download_img()
    # update_video_urls()
    # download_video()
if __name__ == '__main__':
    try:
        main()
    except Exception:
        traceback.print_exc()
        log.error(traceback.format_exc())
        # reconnect, wait, then try once more
        conn = pymysql.connect(host='127.0.0.1',
                               user='user',
                               passwd='passwd',
                               db='nsfwpic',
                               charset='utf8')
        cur = conn.cursor()
        log.error('Sleeping for 15 minutes, then starting over')
        time.sleep(900)
        main()
    finally:
        cur.close()
        conn.close()
# nsfwpic database schema
'''CREATE TABLE picxxxx_pages(                      -- image pages scraped from the index pages
no INT UNSIGNED NOT NULL AUTO_INCREMENT,            -- row id
url VARCHAR(255) NOT NULL DEFAULT "unknown",        -- page URL
checked TINYINT NOT NULL DEFAULT 0,                 -- whether the page's images have been recorded
content MEDIUMTEXT,                                 -- page content (raw HTML)
created TIMESTAMP DEFAULT CURRENT_TIMESTAMP,        -- row creation time
PRIMARY KEY(no));
'''
'''CREATE TABLE picxxxx_pics(                       -- image info table
no INT UNSIGNED NOT NULL AUTO_INCREMENT,            -- row id
url VARCHAR(255) NOT NULL DEFAULT "unknown",        -- page the image came from
img_url VARCHAR(255) NOT NULL DEFAULT "unknown",    -- image URL
place VARCHAR(255) NOT NULL DEFAULT "unknown",      -- local storage path
downloaded TINYINT NOT NULL DEFAULT 0,              -- whether the image has been downloaded
created TIMESTAMP DEFAULT CURRENT_TIMESTAMP,        -- row creation time
PRIMARY KEY(no));
'''
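The tables must exist before the first run. A minimal one-time setup sketch, assuming the same connection parameters as in the script above (the table bodies are exactly the ones quoted above):

import pymysql

conn = pymysql.connect(host='127.0.0.1', user='user', passwd='passwd', charset='utf8')
cur = conn.cursor()
cur.execute('CREATE DATABASE IF NOT EXISTS nsfwpic CHARACTER SET utf8;')
cur.execute('USE nsfwpic;')
cur.execute('''CREATE TABLE IF NOT EXISTS picxxxx_pages(
    no INT UNSIGNED NOT NULL AUTO_INCREMENT,
    url VARCHAR(255) NOT NULL DEFAULT "unknown",
    checked TINYINT NOT NULL DEFAULT 0,
    content MEDIUMTEXT,
    created TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY(no));''')
cur.execute('''CREATE TABLE IF NOT EXISTS picxxxx_pics(
    no INT UNSIGNED NOT NULL AUTO_INCREMENT,
    url VARCHAR(255) NOT NULL DEFAULT "unknown",
    img_url VARCHAR(255) NOT NULL DEFAULT "unknown",
    place VARCHAR(255) NOT NULL DEFAULT "unknown",
    downloaded TINYINT NOT NULL DEFAULT 0,
    created TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY(no));''')
cur.close()
conn.close()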