psycopg2是PostgreSQL非常好的Python Client Library, 本文通过爬取上市公司的基础数据来演示如何使用
表结构
create table if not exists company(
symbol varchar(32), --股票代码
name varchar(128), --公司名称
ceo varchar(64), --法人代表,
capitol varchar(64), --注册资金
addr varchar(128), --注册地址
establish_date date, --成立日期
listing_date date, --上市日期,
scope text, -- 经营范围
profile text, --公司简介
primary key(symbol)
);
数据爬取
python package安装
pip install selenium geckodriver psycopg2 pandas
东方财富提供的上市公司数据很齐全,下面的代码演示如何将 公司简介 这张表中的数据抽取出来并存入到数据库中。
## filename: stock_web_crawler.py
import akshare as ak
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException
import unittest, time, re
from selenium.webdriver.firefox.options import Options as FirefoxOptions
import psycopg2
import pandas as pd
options = webdriver.FirefoxOptions()
options.headless = True
f_df = ak.stock_zh_a_spot_em()
stock_code_df = f_df['代码']
insert_sql = """insert into company(symbol, name, ceo, addr, capitol, establish_date, listing_date, scope, profile) values(%(symbol)s, %(name)s, %(ceo)s, %(addr)s, %(capitol)s, %(establish_date)s, %(listing_date)s,%(scope)s,%(profile)s)"""
pg_conn_str = "host='localhost' port='5434' user='dba' dbname='postgres'"
conn = psycopg2.connect(pg_conn_str)
for i in stock_code_df.sort_values().head(10):
driver = webdriver.Firefox(options=options)
driver.get("http://data.eastmoney.com/stockdata/{}.html".format(i))
html_text = driver.page_source
dfs = pd.read_html(html_text)
co_df = dfs[32]
with conn.cursor() as cur:
stock_dict=({"symbol": i, "name":co_df.iloc[0][1],"ceo":co_df.iloc[0][3],
'addr':co_df.iloc[1][1], 'capitol': co_df.iloc[1][3],
'establish_date':co_df.iloc[2][1], 'listing_date':co_df.iloc[2][3],
'scope':co_df.iloc[3][1],'profile':co_df.iloc[4][1]},)
cur.executemany(insert_sql, stock_dict)
conn.commit()
driver.close()
运行上述脚本
python stock_web_crawler.py
python版本
Python 3.9.1 (default, Dec 11 2020, 14:32:07)
[GCC 7.3.0] :: Anaconda, Inc. on linux
Type "help", "copyright", "credits" or "license" for more information.