Finally reached reputation level 2, so I'm posting a Q&A scraper script written in Python to see whether there are any Python enthusiasts here.

I'm self-taught and not a professional, so the code is probably non-standard and may have plenty of mistakes; I'm posting the source to see whether any professionals or fellow enthusiasts can offer suggestions. The main goal is to find out what customers care about: word frequency gives a quick macro view of their biggest concerns.
Right now it only scrapes the content; to build a word cloud you still have to copy the text into a word-frequency website by hand (a local alternative is sketched after the script). The answers file also picks up some unwanted text, but I haven't bothered to fix that, since I think the questions are what really matter.
import requests, threading, time
from bs4 import BeautifulSoup
from collections import Counter
from queue import Queue, Empty
import sys, os

def get_session():
    return requests.session()

# fetch the response for a URL
def fetch(session, url):
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
        'referer': 'https://www.amazon.com/'  # 'refer' is not a real header name
    }
    resp = session.get(url, headers=headers)
    return resp

# get the total number of Q&A pages for the ASIN
def get_pages(session, url, asin):
    url = url.format(asin, 2)
    resp = fetch(session, url)
    soup = BeautifulSoup(resp.text, 'lxml')
    # the second-to-last pagination item holds the last page number
    pages = soup.select('.a-pagination > li:nth-last-child(2) > a')[0].text
    return pages

# scrape the Q&A list; each worker thread drains URLs from the shared queue
def get_qa(session, urlList):
    while True:
        try:
            url = urlList.get_nowait()  # the old `while urlList._qsize()` check used a private, racy API
        except Empty:
            break
        resp = fetch(session, url)
        soup = BeautifulSoup(resp.text, 'lxml')
        group_qa = soup.select('.askTeaserQuestions > .a-fixed-left-grid.a-spacing-base > .a-fixed-left-grid-inner > .a-fixed-left-grid-col.a-col-right')
        for item in group_qa:
            question = item.select('.a-fixed-left-grid-col.a-col-right > a > span')[0].text.strip()
            answer = item.select('.a-fixed-left-grid.a-spacing-base .a-fixed-left-grid-col.a-col-right > span')[0].text
            # f.write(question + '\t' + answer + '\n')
            fq.write(question + '\n')
            fa.write(answer + '\n')
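# Optional hardening (not in the original script): the worker threads above all
# write to the shared fq/fa file objects. Under CPython's GIL short writes rarely
# interleave, but a lock makes the output deterministic. A hypothetical guarded
# writer the threads could call instead of writing to the files directly:
write_lock = threading.Lock()

def write_pair(question, answer):
    with write_lock:  # one thread at a time touches the output files
        fq.write(question + '\n')
        fa.write(answer + '\n')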

def main(url, asin):
    session = get_session()
    # total number of pages
    pages = int(get_pages(session, url, asin))
    urlList = Queue()  # queue up the page URLs to scrape
    for i in range(1, pages + 1):
        furl = url.format(asin, i)
        urlList.put(furl)
    # scrape the Q&A with a pool of worker threads
    thread_list = []
    thread_count = 15
    for _ in range(thread_count):
        t = threading.Thread(target=get_qa, args=(session, urlList))
        t.start()
        thread_list.append(t)
    for t in thread_list:
        t.join()

# deprecated: the idea was to strip common articles and pronouns before counting word frequency
def get_most_count(TEXT):
    for char in '\n\t.?-':
        TEXT = TEXT.replace(char, ' ')
    # filter whole words instead of calling str.replace(), which also mangled substrings (e.g. every letter 'a')
    stopwords = {'the', 'i', 'to', 'you', 'and', 'a', 'these', 'it', 'they', 'with', 'have', 'can', 'be', 'at', 'of', 'are', 'them'}
    word_list = [word for word in TEXT.split() if word.lower() not in stopwords]
    print(Counter(word_list).most_common())

if __name__ == "__main__":
    os.chdir(sys.path[0])
    ### configuration ###
    # change the ASIN here
    asin = 'B07DPJVN6P'
    us = 'https://www.amazon.com/'
    uk = 'https://www.amazon.co.uk/'
    # change the marketplace here
    Marketplace = uk
    baseurl = Marketplace + 'ask/questions/asin/{}/{}/ref=ask_dp_iaw_ql_hza?isAnswered=true'
    fq = open('./questions.txt', 'w', encoding='utf-8')  # output file for the questions
    fa = open('./answers.txt', 'w', encoding='utf-8')  # output file for the answers
    start = time.time()
    # run
    main(baseurl, asin)
    fq.close()
    fa.close()
    print('duration: %.2f' % (time.time() - start))
    # word frequency
    # f = open('a.txt', 'r', encoding='utf-8')
    # get_most_count(f.read())
    # f.close()
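Since generating the word cloud still means pasting the text into a word-frequency website by hand, here is a minimal sketch of doing the same thing locally. It assumes the script above has already produced questions.txt; the stopword set is only illustrative, and the word cloud step needs the third-party wordcloud package (pip install wordcloud), which is optional.

from collections import Counter
from wordcloud import WordCloud  # third-party, optional: pip install wordcloud

# illustrative stopword set; extend as needed
STOPWORDS = {'the', 'i', 'to', 'you', 'and', 'a', 'these', 'it',
             'they', 'with', 'have', 'can', 'be', 'at', 'of', 'are', 'them'}

with open('questions.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# turn punctuation into spaces, then drop stopwords whole-word
for ch in '\n\t.?!,-':
    text = text.replace(ch, ' ')
words = [w for w in text.split() if w.lower() not in STOPWORDS]

# macro view of the most frequent customer concerns
print(Counter(words).most_common(30))

# render a word cloud image instead of pasting into a website
WordCloud(width=800, height=400).generate(' '.join(words)).to_file('questions_cloud.png')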