import requests
from bs4 import BeautifulSoup
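
# Scrape the 缠中说禅 Sina blog: walk the paginated article list, download
# every post, and save each one three ways: as plain text, as a section of
# one aggregate markdown file, and as a standalone markdown post rendered
# from a template.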
 
 text_file_name = "files/缠中说禅.txt"
 md_file_name = "files/缠中说禅.md"
 markdown_template = "files/markdown/template.md"
 markdown_post_dir = "files/markdown/posts/"
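
# template.md is assumed to consume eight positional format arguments
# (e.g. "{}" placeholders), filled in write_to_markdown_post() in the
# order: title, category, category, title, title, create time, source
# URL, body.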
 
 
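# Append a chunk of rendered markdown to the single aggregate .md file.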
def write_to_md_file(context):
    # Open with an explicit encoding so the Chinese text survives on
    # platforms whose default encoding is not UTF-8.
    with open(md_file_name, "a+", encoding="utf-8") as f:
        f.write(context)
        f.flush()
 
 
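# Append a chunk of plain text to the aggregate .txt dump.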
def write_to_text(context):
    with open(text_file_name, "a+", encoding="utf-8") as f:
        f.write(context)
        f.flush()
 
 
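# Render one article into its own markdown file using the template.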
def write_to_markdown_post(article_body_url, title_name, category, create_time, content_body):
    # One sentence per line: break the body at the Chinese full stop.
    content_body = '。\n'.join(content_body.split("。"))
    # A "/" in the title would otherwise be read as a path separator below.
    title_name = title_name.replace("/", "-")
    with open(markdown_template, encoding="utf-8") as f:
        context_template = f.read()
    full_content_body = context_template.format(title_name,
                                                category,
                                                category,
                                                title_name,
                                                title_name,
                                                create_time,
                                                article_body_url,
                                                content_body)

    with open(markdown_post_dir + title_name + ".md", "w", encoding="utf-8") as f:
        f.write(full_content_body)
        f.flush()
 
 
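# Fetch a single article page and extract its title, creation time,
# category, and body. The CSS class names (articalTitle, articalContent,
# blog_class, ...) are the ones Sina's blog layout uses; note that
# "artical" is Sina's own spelling.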
def download_article_body(article_body_url):
    r = requests.get(article_body_url)
    r.raise_for_status()
    # Name the parser explicitly instead of letting bs4 guess one.
    soup = BeautifulSoup(r.content.decode("utf-8"), "html.parser")
    title_name = soup.find(class_="articalTitle").find(class_="titName SG_txta").text.strip()
    create_time = soup.find(class_="articalTitle").find(class_="time SG_txtc").text.strip()
    # Articles without a category fall back to "无分类" ("Uncategorized").
    category = "无分类"
    blog_class = soup.find(class_="blog_class")
    if blog_class is not None and blog_class.find('a') is not None:
        category = blog_class.find('a').text.strip()
    content_body = soup.find(class_="articalContent").text.strip()

    full_context = "\n{} - {} - {} \n{} \n".format(title_name, category, create_time, content_body)
    write_to_text(full_context)
    content_body_md = '。\n'.join(content_body.split("。"))
    md_full_context = "\n## {} \n### {} \n### {} \n {}".format(title_name, category, create_time, content_body_md)
    write_to_md_file(md_full_context)
    write_to_markdown_post(article_body_url, title_name, category, create_time, content_body)
 
 
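# Scrape one page of the article list, then recurse into the next page
# via the SG_pgnext link until the last page is reached.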
def worker(page_url):
    print("Starting process:", page_url)
    r = requests.get(page_url)
    r.raise_for_status()
    soup = BeautifulSoup(r.content.decode("utf-8"), "html.parser")
    for article_list in soup.find_all(class_="atc_title"):
        article_body_url = article_list.find('a', href=True).attrs['href']
        download_article_body(article_body_url)

    if soup.find(class_="SG_pgprev") is None:
        print("This is the first page.")

    if soup.find(class_="SG_pgnext") is not None:
        next_url = soup.find(class_="SG_pgnext").find('a', href=True).attrs['href']
        print("Fetching the next page:", next_url)
        worker(next_url)
    else:
        print("This is the last page.")
    print("Finished worker:", page_url)
 
 
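# Entry point: start from page 1 of the author's article list; worker()
# follows the pagination links from there.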
if __name__ == '__main__':
    url = "http://blog.sina.com.cn/s/articlelist_1215172700_0_1.html"
    worker(url)
    print("All workers are done.")
 