程序员三大美德之一:快速有效检索网页数据的“懒惰”程序员指南
# Benchmark: scraping Yahoo Finance HTML pages versus calling the JSON
# endpoints that back those pages (the "lazy" approach). Each approach is
# timed with timeit.repeat and the best per-call time is printed.
import timeit
import requests
from bs4 import BeautifulSoup
import json
import re

# timeit settings: `repeat` independent rounds of `number` calls each.
repeat = 5
number = 5


def web_scrape_1():
    """Return the current FB price by parsing the HTML quote page.

    Downloads the quote page, parses it with BeautifulSoup (lxml parser) and
    reads the text of the first <span> inside the <div> carrying the price
    CSS classes.

    Returns:
        A string of the form ``'the current price: <price>'``.
    """
    r = requests.get('https://finance.yahoo.com/quote/FB?p=FB')
    soup = BeautifulSoup(r.text, "lxml")
    # NOTE(review): class string reconstructed from a whitespace-mangled
    # source; verify it against the live page markup.
    price = soup.find(
        'div', {'class': 'My(6px) Pos(r) smartphone_Mt(6px)'}
    ).find('span').text
    return f'the current price: {price}'


def lazy_1():
    """Return the current FB price by reading the quoteSummary JSON endpoint.

    Requests the ``price`` module and reads
    ``quoteSummary.result[0].price.regularMarketPrice.raw`` from the decoded
    JSON response.

    Returns:
        A string of the form ``'the current price: <price>'``.
    """
    r = requests.get(
        'https://query2.finance.yahoo.com/v10/finance/quoteSummary/FB'
        '?modules=price'
    )
    data = r.json()
    raw_price = (
        data['quoteSummary']['result'][0]['price']['regularMarketPrice']['raw']
    )
    return f"the current price: {raw_price}"


def web_scrape_2():
    """Return key statistics for AGL.AX by parsing the key-statistics page.

    The page embeds its data as a JavaScript assignment
    (``root.App.main = {...};``); the JSON object is pulled out with a regex
    and decoded.

    Returns:
        ``{ticker: {'Enterprise Value': ..., 'Shares Short': ...}}`` where
        'Shares Short' falls back to ``'N/A'`` when ``longFmt`` is absent.
    """
    # Dots are escaped so they match literally rather than any character.
    p = re.compile(r'root\.App\.main = (.*);')
    ticker = 'AGL.AX'
    results = {}
    with requests.Session() as s:
        r = s.get(
            'https://finance.yahoo.com/quote/{}/key-statistics?p={}'
            .format(ticker, ticker)
        )
        data = json.loads(p.findall(r.text)[0])
        key_stats = (
            data['context']['dispatcher']['stores']['QuoteSummaryStore']
        )
        res = {
            'Enterprise Value':
                key_stats['defaultKeyStatistics']['enterpriseValue']['fmt'],
            'Shares Short':
                key_stats['defaultKeyStatistics']['sharesShort']
                .get('longFmt', 'N/A'),
        }
        results[ticker] = res
    return results


def lazy_2():
    """Return key statistics for AGL.AX from the quoteSummary JSON endpoint.

    Requests the ``defaultKeyStatistics`` module and builds the same result
    shape as ``web_scrape_2``.

    Returns:
        ``{'AGL.AX': {'Enterprise Value': ..., 'Shares Short': ...}}`` where
        'Shares Short' falls back to ``'N/A'`` when ``longFmt`` is absent.
    """
    r = requests.get(
        'https://query2.finance.yahoo.com/v10/finance/quoteSummary/AGL.AX'
        '?modules=defaultKeyStatistics'
    )
    data = r.json()
    stats = data['quoteSummary']['result'][0]['defaultKeyStatistics']
    return {
        'AGL.AX': {
            'Enterprise Value': stats['enterpriseValue']['fmt'],
            'Shares Short': stats['sharesShort'].get('longFmt', 'N/A'),
        }
    }


# Each benchmark reports the fastest round divided by `number`, i.e. the best
# observed time for a single call.
web_scraping_1_times = timeit.repeat(
    'web_scrape_1()',
    setup='import requests; from bs4 import BeautifulSoup',
    globals=globals(),
    repeat=repeat,
    number=number)
print(f'web scraping #1 min time is {min(web_scraping_1_times) / number}')

lazy_1_times = timeit.repeat(
    'lazy_1()',
    setup='import requests',
    globals=globals(),
    repeat=repeat,
    number=number
)
print(f'lazy #1 min time is {min(lazy_1_times) / number}')

web_scraping_2_times = timeit.repeat(
    'web_scrape_2()',
    setup='import requests, re, json',
    globals=globals(),
    repeat=repeat,
    number=number)
print(f'web scraping #2 min time is {min(web_scraping_2_times) / number}')

lazy_2_times = timeit.repeat(
    'lazy_2()',
    setup='import requests',
    globals=globals(),
    repeat=repeat,
    number=number
)
print(f'lazy #2 min time is {min(lazy_2_times) / number}')

# Sample output reported by the article:
#   web scraping #1 min time is 0.5678426799999997
#   lazy #1 min time is 0.11238783999999953
#   web scraping #2 min time is 0.3731000199999997
#   lazy #2 min time is 0.0864451399999993
#
# The "lazy" alternatives are 4 to 5 times faster than their web-scraping
# counterparts!
#
# The "lazy" process
#
# (Editor: App Network, Yangjiang Webmaster Site)
# [Disclaimer] All content on this site comes from the internet; the views
# expressed are solely those of the author and do not represent the position
# of this site. If your rights have been inadvertently infringed, please
# contact the webmaster promptly to have the content removed.