网站建设新手看什么书,建设网站具体的步骤,手机微信一体网站建设,网页布局设计框架图表
参考博客：基于Python知乎回答爬虫 jieba关键字统计可视化_知乎爬虫搜索关键词_菠萝柚王子的博客-CSDN博客
1、安装依赖包
import numpy
import requests
import certifi
from PIL import Image
from lxml import etree
import jieba
from wordcloud import WordClo…
参考博客：基于Python知乎回答爬虫 jieba关键字统计可视化_知乎爬虫搜索关键词_菠萝柚王子的博客-CSDN博客
1、安装依赖包
import numpy
import requests
import certifi
from PIL import Image
from lxml import etree
import jieba
from wordcloud import WordCloud
手动安装插件：
1、下载插件包，解压到路径 Python3\Lib\site-packages
2、进入插件包目录，执行 python setup.py install 命令进行安装

2、爬取问题答案
def fetch_text(
    question_id=308447090,
    interval=5,
    end=1202,
    output_path="anwsers.txt",
    cookie=None,
    timeout=30,
    verify=True,
):
    """Download the answers of a Zhihu question into a text file.

    The question page is fetched first and its ``keywords`` meta tag is
    printed. The answers are then pulled page by page from the
    ``/api/v4/questions/<id>/feeds`` endpoint. For every answer the
    ``content`` and ``excerpt`` fields are written to *output_path*, wrapped
    in ``anwser start~~~N~~~`` / ``anwser end~~~N~~~`` marker lines.

    Args:
        question_id: Numeric id of the Zhihu question.
        interval: Number of answers requested per page.
        end: Stop once the paging offset reaches this value (roughly the
            number of answers of the question).
        output_path: File the answers are written to (overwritten).
        cookie: Value for the ``Cookie`` request header. When ``None`` the
            ``ZHIHU_COOKIE`` environment variable is used. The session
            cookies that used to be pasted into this function are
            credentials and must not live in source code.
        timeout: Per-request timeout in seconds.
        verify: Passed to ``requests.get``; ``True`` verifies TLS
            certificates, a string selects a CA bundle. Pass ``False`` only
            if you knowingly accept sending the cookie over an unverified
            connection.

    Raises:
        ValueError: If *interval* is not positive.
        requests.HTTPError: If a request returns an error status.
    """
    import os

    if interval <= 0:
        raise ValueError("interval must be a positive integer")
    if cookie is None:
        cookie = os.environ.get("ZHIHU_COOKIE", "")

    user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) "
        "Gecko/20100101 Firefox/110.0"
    )
    question_url = f"https://www.zhihu.com/question/{question_id}"

    page_headers = {
        "User-Agent": user_agent,
        "Accept": (
            "text/html,application/xhtml+xml,application/xml;q=0.9,"
            "image/avif,image/webp,*/*;q=0.8"
        ),
    }
    # Accept-Encoding is deliberately left to requests so that only encodings
    # it can actually decode are advertised to the server.
    api_headers = {
        "User-Agent": user_agent,
        "Accept": "*/*",
        "Accept-Language": "en-US,en;q=0.5",
        "Referer": question_url,
    }
    if cookie:
        page_headers["Cookie"] = cookie
        api_headers["Cookie"] = cookie

    # Question page: only used to print the keywords meta tag.
    page = requests.get(
        question_url, headers=page_headers, timeout=timeout, verify=verify
    )
    page.raise_for_status()
    tree = etree.HTML(page.text)
    keyword_nodes = tree.xpath('//div/meta[@itemprop="keywords"]')
    if keyword_nodes:
        print(keyword_nodes[0].get("content"))

    answer_no = 1  # running 1-based counter used in the marker lines
    with open(output_path, "w", encoding="utf8") as out:
        for offset in range(0, end, interval):
            api_url = (
                f"https://www.zhihu.com/api/v4/questions/{question_id}/feeds"
                f"?include=content&limit={interval}&offset={offset}"
                "&order=default"
            )
            resp = requests.get(
                api_url, headers=api_headers, timeout=timeout, verify=verify
            )
            resp.raise_for_status()
            answers = resp.json()["data"]
            print(len(answers))
            if not answers:
                # The server has nothing more to return; do not keep
                # requesting empty pages until `end` is reached.
                break
            for answer in answers:
                target = answer["target"]
                out.write("anwser start~~~" + str(answer_no) + "~~~\n")
                out.write(target["content"] + "\n")
                out.write(target["excerpt"] + "\n")
                out.write("anwser end~~~" + str(answer_no) + "~~~\n")
                answer_no += 1
    print("结束")
3、 截取分析答案
# Check whether a string consists only of Chinese (Han) characters.
def is_han(text: str) -> bool:
    """Return True if every character of *text* is a CJK unified ideograph.

    Only the basic block U+4E00..U+9FFF is considered. An empty string
    yields True because there is no character that fails the check; callers
    that care must filter empty strings themselves.
    """
    return all('\u4e00' <= char <= '\u9fff' for char in text)
def count_worlds(
    answers_path="./anwsers.txt",
    mask_path="1.png",
    font_path=r"C:/Windows/Fonts/STSONG.TTF",
    output_path="result.png",
):
    """Count the Chinese words of the scraped answers and draw a word cloud.

    The text in *answers_path* is segmented with ``jieba`` in full mode.
    Tokens that are longer than one character and consist only of Han
    characters are kept. Their frequencies are printed as a list of
    ``(word, count)`` pairs sorted by descending count, and a word cloud of
    the kept tokens is written to *output_path*.

    Args:
        answers_path: UTF-8 text file with the scraped answers.
        mask_path: Image whose pixel array is passed to WordCloud as ``mask``.
        font_path: Font file used for rendering; it has to contain Chinese
            glyphs for the words to be drawn.
        output_path: Destination of the rendered word cloud image.
    """
    from collections import Counter

    with open(answers_path, "r", encoding="utf-8") as fh:
        text = fh.read()

    # Single characters (and empty tokens) carry no useful signal and are
    # dropped before the Han-only filter is applied.
    words = [
        token
        for token in jieba.lcut(text, cut_all=True)
        if len(token) > 1 and is_han(token)
    ]

    word_count = Counter(words)
    print(word_count.most_common())

    # Load the mask image; the pixel data is copied into the array before
    # the image file is closed.
    with Image.open(mask_path) as mask_image:
        mask_pic = numpy.array(mask_image)

    wordcloud = WordCloud(
        font_path=font_path,
        collocations=False,
        max_words=100,
        min_font_size=10,
        max_font_size=500,
        mask=mask_pic,
    ).generate(" ".join(words))
    # Save the word cloud to disk.
    wordcloud.to_file(output_path)