郑州高端建站,计算机专业吃香吗,网站制作xiu021,商业空间设计概述免责声明
文章仅做经验分享用途，利用本文章所提供的信息而造成的任何直接或者间接的后果及损失，均由使用者本人负责，作者不为此承担任何责任，一旦造成后果请自行承担！！！ import os import re i…免责声明
# This article is shared for experience purposes only. Any direct or indirect
# consequences or losses caused by using the information it provides are the
# sole responsibility of the user; the author accepts no liability. If any
# consequences arise, you bear them yourself.

import os
import re
import sys
import requests

# NOTE(review): the argument of this call was lost when the article was
# extracted; an empty string is presumed -- confirm against the original.
sys.path.append("")

from os.path import join, exists
from urllib.request import urlretrieve
from bs4 import BeautifulSoup, Tag, NavigableString, Comment


# HTTP headers sent with every request made by this script.
header = {
    "Accept": "application/json, text/plain, */*",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,und;q=0.7",
    "Connection": "keep-alive",
    "Content-Type": "application/json;charset=UTF-8",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Safari/537.36",
    "x-requested-with": "XMLHttpRequest",
}

# Literal entity text replaced inside text nodes before they are emitted.
# NOTE(review): the "<" and ">" values were erased by the extraction and are
# presumed from the entity names -- confirm.
special_characters = {
    "&lt;": "<",
    "&gt;": ">",
    "&nbsp": " ",
    "&#8203": "",
}


class Parser(object):
    """Convert an HTML fragment to Markdown by walking its parse tree.

    The conversion runs in the constructor. The resulting text pieces are
    collected, in document order, in ``self.outputs``; join them with
    ``''.join(parser.outputs)`` to obtain the Markdown text. Images found
    in ``<img>`` tags are downloaded into ``markdown_dir``.
    """

    def __init__(self, html, markdown_dir):
        """Parse *html* and convert it immediately.

        Args:
            html: HTML source to convert.
            markdown_dir: Directory that receives downloaded images; it is
                created when it does not exist.
        """
        self.html = html
        self.soup = BeautifulSoup(html, 'html.parser')
        self.outputs = []
        self.fig_dir = markdown_dir
        self.pre = False          # True while inside a <pre> awaiting its <code>
        self.equ_inline = False   # True only while an inline formula is emitted

        if not exists(self.fig_dir):
            os.makedirs(self.fig_dir)
        self.recursive(self.soup)

    def remove_comment(self, soup):
        """Recursively detach every HTML comment node below *soup*."""
        if not hasattr(soup, 'children'):
            return
        # Iterate over a snapshot: extract() mutates the live child list.
        for c in list(soup.children):
            if isinstance(c, Comment):
                c.extract()
            else:
                self.remove_comment(c)

    def recursive(self, soup):
        """Emit Markdown for *soup* and all of its descendants.

        Markdown markers are injected as extra text nodes at the start and
        end of a tag's contents, so they are emitted in order when the
        children are visited afterwards.
        """
        if isinstance(soup, Comment):
            return
        if isinstance(soup, NavigableString):
            # Work on a plain str copy instead of assigning to the node.
            text = str(soup)
            for key, val in special_characters.items():
                text = text.replace(key, val)
            self.outputs.append(text)
            return
        if isinstance(soup, Tag):
            tag = soup.name
            if tag in ['h1', 'h2', 'h3', 'h4', 'h5']:
                n = int(tag[1])
                soup.contents.insert(0, NavigableString('\n' + '#' * n + ' '))
                soup.contents.append(NavigableString('\n'))
            elif tag == 'a' and 'href' in soup.attrs:
                soup.contents.insert(0, NavigableString('['))
                soup.contents.append(
                    NavigableString(']({})'.format(soup.attrs['href'])))
            elif tag in ['b', 'strong']:
                soup.contents.insert(0, NavigableString('**'))
                soup.contents.append(NavigableString('**'))
            elif tag in ['em']:
                soup.contents.insert(0, NavigableString('*'))
                soup.contents.append(NavigableString('*'))
            elif tag == 'pre':
                self.pre = True
            elif tag in ['code', 'tt']:
                if self.pre:
                    # Default to an unlabelled fence so an unrecognised class
                    # no longer leaves ``language`` unbound.
                    language = ''
                    if 'class' not in soup.attrs:
                        language = 'bash'
                    else:
                        # e.g. <code class="prism language-cpp">
                        classes = ' '.join(list(soup.attrs['class']))
                        for name in ['cpp', 'bash', 'python', 'java']:
                            if name in classes:
                                language = name
                    # NOTE(review): the fence markers were erased by the
                    # extraction; triple backticks are presumed -- confirm.
                    soup.contents.insert(
                        0, NavigableString('\n```{}\n'.format(language)))
                    soup.contents.append(NavigableString('```\n'))
                    # Assume a <pre> contains only one <code>.
                    self.pre = False
                else:
                    # NOTE(review): inline-code marker was erased by the
                    # extraction; a single backtick is presumed -- confirm.
                    soup.contents.insert(0, NavigableString('`'))
                    soup.contents.append(NavigableString('`'))
            elif tag == 'p':
                if soup.parent.name != 'li':
                    soup.contents.insert(0, NavigableString('\n'))
            elif tag == 'span':
                classes = soup.attrs.get('class', [])
                if 'katex--inline' in classes or 'katex--display' in classes:
                    annotation = soup.find(
                        'annotation', {'encoding': 'application/x-tex'})
                    # Without a TeX annotation, fall through and emit the
                    # span's children instead of raising.
                    if annotation is not None:
                        self.equ_inline = 'katex--inline' in classes
                        math_start_sign = '$' if self.equ_inline else '\n\n$$'
                        math_end_sign = '$' if self.equ_inline else '$$\n\n'
                        equation = (math_start_sign + str(annotation.string)
                                    + math_end_sign)
                        self.outputs.append(equation)
                        self.equ_inline = False
                        return
            elif tag in ['ol', 'ul']:
                soup.contents.insert(0, NavigableString('\n'))
                soup.contents.append(NavigableString('\n'))
            elif tag in ['li']:
                # NOTE(review): the bullet marker was erased by the
                # extraction; '+ ' is presumed -- confirm.
                soup.contents.insert(0, NavigableString('+ '))
            elif tag == 'img':
                src = soup.attrs.get('src')
                if not src:
                    # Nothing to download or link to.
                    return
                pattern = r'(.*\..*\?)|(.*\.(png|jpeg|jpg))'
                matches = re.findall(pattern, src)
                if matches:
                    # Group 0: URL up to its query marker; group 1: URL
                    # ending in a known image extension.
                    matched = matches[0][0] or matches[0][1]
                    img_file = matched.split('/')[-1].rstrip('?')
                else:
                    # Fall back to the last path segment of the URL.
                    img_file = (src.split('?')[0].rstrip('/').split('/')[-1]
                                or 'image')
                img_file = join(self.fig_dir, img_file)
                urlretrieve(src, img_file)
                # NOTE(review): the image template was erased by the
                # extraction; Markdown image syntax is presumed -- confirm.
                code = '![{}]({})'.format(img_file, img_file)
                self.outputs.append('\n' + code + '\n')
                return
        if not hasattr(soup, 'children'):
            return
        for child in soup.children:
            self.recursive(child)


def html2md(url, md_file, with_title=False, fig_dir=None):
    """Fetch *url* and write its article body to *md_file* as Markdown.

    Args:
        url: Address of the article page.
        md_file: Path of the Markdown file to write (UTF-8).
        with_title: Also convert the ``div.article-title-box`` element.
        fig_dir: Directory for downloaded images. Defaults to the directory
            of *md_file*, so the function no longer depends on a global
            that only exists when the file runs as a script.
    """
    if fig_dir is None:
        fig_dir = os.path.dirname(md_file) or '.'

    response = requests.get(url, headers=header)
    soup = BeautifulSoup(response.content, 'html.parser',
                         from_encoding="utf-8")
    for child in soup.find_all('svg'):
        child.extract()

    html = ""
    if with_title:
        for c in soup.find_all('div', {'class': 'article-title-box'}):
            html += str(c)
    for c in soup.find_all('div', {'id': 'content_views'}):
        html += str(c)

    parser = Parser(html, fig_dir)
    with open(md_file, 'w', encoding="utf-8") as f:
        f.write('{}\n'.format(''.join(parser.outputs)))


def download_csdn_single_page(article_url, md_dir, with_title=True,
                              pdf_dir='pdf', to_pdf=False):
    """Save one CSDN article as a Markdown file inside *md_dir*.

    The file is named after the first whitespace-separated word of the
    page's ``h1.title-article`` text, with ``*`` characters removed.
    ``pdf_dir`` and ``to_pdf`` are accepted but not used here.
    """
    response = requests.get(article_url, headers=header)
    soup = BeautifulSoup(response.content, 'html.parser',
                         from_encoding="utf-8")
    # Use the HTML title as the Markdown file name.
    title = soup.find_all('h1', {'class': 'title-article'})[0].string
    title = title.replace('*', '').strip().split()
    md_file = join(md_dir, title[0] + '.md')
    print('正在保存 Markdown File To {}'.format(md_file))
    html2md(article_url, md_file, with_title=with_title, fig_dir=md_dir)


if __name__ == '__main__':
    article_url = input(str('输入需要保存的csdn博客链接'))  # CSDN article URL
    markdown_dir = './'  # output directory
    download_csdn_single_page(article_url, markdown_dir)

# Closing remarks
没有人规定一朵花一定要成长为向日葵或者玫瑰。