1000字范文,内容丰富有趣,学习的好帮手!
1000字范文 > 使用selenium抓取华尔街见闻和新浪财经数据

使用selenium抓取华尔街见闻和新浪财经数据

时间:2019-10-07 04:55:14

相关推荐

使用selenium抓取华尔街见闻和新浪财经数据

# -*- coding: utf-8 -*-
"""Scrape live financial news into a local MongoDB instance.

- Wallstreetcn (华尔街见闻) global macro live feed -> collection news.hej_news
- Sina Finance (新浪财经) 7x24 breaking-news feed  -> collection news.xlcj_news

A real Chrome browser is driven through selenium so the JavaScript-rendered
feeds are present in the page source before BeautifulSoup parses them.
"""

import re
import time

import pymongo
from bs4 import BeautifulSoup
from selenium import webdriver

# from fake_useragent import UserAgent
# ua_list = UserAgent()
# Fixed fallback user-agent string (currently not wired into the driver).
UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/0101 Firefox/52.0'

# Path to the chromedriver binary.
CHROMEDRIVER = r"/usr/local/share/chromedriver"


def _lazy_load_soup(driver):
    """Scroll the page in two steps to trigger lazy loading, then parse it.

    Returns a BeautifulSoup over the fully rendered page source.
    """
    for js in ('window.scrollBy(0,3000)', 'window.scrollBy(0,5000)'):
        driver.execute_script(js)
        time.sleep(5)  # give the feed time to append more items
    return BeautifulSoup(driver.page_source, 'html.parser')


def _upsert(collection, key, data):
    """Replace any existing document matching *key*, or insert *data*.

    Replaces the original count()/remove()/insert_one() sequence, which was
    racy and used APIs removed in pymongo 4.
    """
    collection.replace_one(key, data, upsert=True)


def get_hej_news():
    """Scrape the Wallstreetcn global macro live feed into news.hej_news."""
    client = pymongo.MongoClient('localhost', 27017)
    hej_news = client['news']['hej_news']
    driver = webdriver.Chrome(CHROMEDRIVER)
    try:
        # NOTE(review): the original called driver.get('/live/global'); the
        # domain was presumably stripped when the code was copied — confirm.
        driver.get('https://wallstreetcn.com/live/global')
        soup = _lazy_load_soup(driver)
        feed = soup.find('div', class_='livenews')
        for item in feed.find_all('div', class_='live-item'):
            # Bug fix: the original line ended with a stray comma, which made
            # new_time a 1-tuple instead of a string.
            new_time = item.find(
                'span', attrs={'class': 'live-item__time__text'}).get_text()
            # Bug fix: the original passed the regex-looking literal '\n|//'
            # to str.replace, which never matches; re.sub does the intended
            # cleanup (drop newlines and '//' markers).
            raw = item.find('div', attrs={'class': 'content-html'}).get_text()
            news_text = re.sub(r'\n|//', '', raw.strip())
            _upsert(hej_news, {'new_time': new_time},
                    {'new_time': new_time, 'news': news_text})
    finally:
        # quit() closes every window; the original's close()+quit() pair was
        # redundant, and neither ran if parsing raised.
        driver.quit()
    print('存储华尔街见闻宏观新闻成功')


def get_xlcj_news():
    """Scrape pages 1-6 of the Sina Finance 7x24 feed into news.xlcj_news."""
    client = pymongo.MongoClient('localhost', 27017)
    xlcj_news = client['news']['xlcj_news']
    for page in range(1, 7):
        # One fresh browser per page, as in the original.
        driver = webdriver.Chrome(CHROMEDRIVER)
        try:
            # NOTE(review): domain restored by hand; the original kept only
            # the path. Verify it still matches the live site.
            url = ('https://finance.sina.com.cn'
                   '/zt/app_zt/f/v/finance/globalnews1/?page=' + str(page))
            driver.get(url)
            soup = _lazy_load_soup(driver)
            feed = soup.find('div', class_='bd_list')
            for item in feed.find_all('div', class_='bd_i_og'):
                news_time = item.find(
                    'p', attrs={'class': 'bd_i_time_c'}).get_text().strip()
                news_type = item.find(
                    'p', attrs={'class': 'bd_i_tags'}
                ).get_text().strip().replace('\n', '')
                news_text = item.find(
                    'p', attrs={'class': 'bd_i_txt_c'}).get_text()
                print(news_time, news_type, news_text)
                _upsert(xlcj_news, {'news_time': news_time},
                        {'news_time': news_time,
                         'news_type': news_type,
                         'news': news_text})
        finally:
            driver.quit()
    print('新浪财经突发live板块新闻存储成功')


def main():
    # his_time = input('请输入要查询的新闻时间(格式:-11-2 00:00:00):')
    # history_time = str(time.mktime(time.strptime(his_time, '%Y-%m-%d %H:%M:%S'))).replace('.0', '')
    get_hej_news()
    get_xlcj_news()


if __name__ == '__main__':
    main()

本内容不代表本网观点和政治立场,如有侵犯你的权益请联系我们处理。
网友评论
网友评论仅供其表达个人看法,并不表明网站立场。