Commit fa424284 by jscat

titan: 新浪突发数据抓取v1 初版本

parent cd67cbba
# encoding=utf-8
import MySQLdb
import time
import numpy as np
import collections
import csv
import string
import urllib
import urllib2
import json
import time
import MySQLdb
import datetime
import logging
import re
import sys, os
reload(sys)
sys.setdefaultencoding('utf8')
import math
#str1=r"平安银行(000001)2016-10-11融资融券信息显示,千红制药\*ST常林融资余额3,158,497,830元,龙江交通63,234,872\*ST雪博元,乐视网融资买入额天赐材料苏州固得68,490,198元,宝鹰股份融资偿苏州固锝还额65,808,891元,融资净买额2,681,307元,融券余量6,910,915股,融券卖出量18,000股,融券偿还量100股,融资融券余额3,221,732,702元。平安银行融资融券详细信息如下表"
# Load the stock list once at import time. The file is closed as soon as the
# rows have been read instead of staying open for the life of the process.
with open('stocknamelist.csv', "rb") as csvfile:
    reader = csv.reader(csvfile)
    stocknamelist = list(reader)

# Column 1 of each row is the lookup key and column 0 the value
# (presumably name -> code, judging by the variable name).
namecodedict = {}
for stock in stocknamelist:
    namecodedict[stock[1]] = unicode(stock[0])

# One alternation pattern "key1|key2|..." used by methodre. The keys are
# inserted verbatim, so they must already be valid regex fragments.
# A single join replaces the repeated string concatenation.
restr = '|'.join(stock[1] for stock in stocknamelist)
restr = restr.lstrip(' ').rstrip('|')
def methodre(str1):
    """Return the ``namecodedict`` values for every stock pattern found in ``str1``.

    The text is scanned with the module-level alternation ``restr``.
    Duplicate hits are collapsed through a set, so each match contributes
    one entry and the order of the returned list is unspecified.

    Args:
        str1: text to scan.

    Returns:
        list of ``namecodedict`` values, one per distinct match.

    Raises:
        KeyError: if a matched string has no entry in ``namecodedict``.
    """
    distinct_hits = set(re.findall(restr, str1))
    # A hit containing '*' is looked up with a leading backslash, i.e. under
    # the escaped form that was used as the regex alternative.
    return [
        namecodedict['\\' + hit if '*' in hit else hit]
        for hit in distinct_hits
    ]
def methodkmp():
    """Unfinished placeholder (presumably for a KMP-based matcher, by its name).

    The previous body called ``str.index()`` without arguments, which always
    failed with an unrelated-looking ``TypeError``. The stub now states
    explicitly that nothing is implemented.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError("methodkmp is not implemented")
# encoding=utf-8
import requests
import json
import urllib
import sys
reload(sys)
sys.setdefaultencoding('utf8')
def http_post_request(url, params, add_to_headers=None):
    """POST ``params`` as form data to ``url`` and return the decoded JSON reply.

    Args:
        url: target URL.
        params: mapping sent as the form-encoded request body.
        add_to_headers: optional mapping merged over the default headers.

    Returns:
        The parsed JSON body when the server answers 200.
        The JSON *string* ``"{}"`` for any other status code (kept as-is for
        existing callers, even though the type differs from the success case).
        ``None`` when the request itself fails or the body is not valid JSON.
    """
    headers = {
        "Accept": "application/json",
        'Content-Type': 'application/x-www-form-urlencoded'
    }
    if add_to_headers:
        headers.update(add_to_headers)
    # Debug output only; the request body is sent form-encoded from `params`.
    print(json.dumps(params, ensure_ascii=False))

    try:
        # NOTE(review): verify=False disables TLS certificate verification.
        response = requests.post(url, data=params, headers=headers,
                                 timeout=5, verify=False)
    except requests.RequestException as e:
        # Connection errors and timeouts used to escape the handler because
        # the call sat outside the try block.
        print("httpPost failed, detail is:%s,%s" % (url, e))
        return None

    if response.status_code != 200:
        return json.dumps({}, ensure_ascii=False)

    try:
        # Parse once and reuse the result for both logging and return.
        payload = response.json()
    except ValueError as e:
        print("httpPost failed, detail is:%s,%s" % (response.text, e))
        return None
    print(json.dumps(payload, ensure_ascii=False))
    return payload
\ No newline at end of file
# encoding=utf-8
import sys
reload(sys)
sys.setdefaultencoding('utf8')
import settings
import smtplib
from email.mime.text import MIMEText
def send_email(subject, content):
    """Send a plain-text mail using the account configured in ``settings``.

    Args:
        subject: mail subject (may contain non-ASCII text).
        content: mail body (may contain non-ASCII text).

    SMTP-level failures are reported on stdout and swallowed, as before.
    Errors raised while opening the connection that are not
    ``smtplib.SMTPException`` propagate to the caller.
    """
    from email.header import Header

    # Declare UTF-8 explicitly so non-ASCII subject/body are encoded properly.
    msg = MIMEText(content, 'plain', 'utf-8')
    msg['Subject'] = Header(subject, 'utf-8')
    msg['From'] = settings.msg_from
    msg['To'] = settings.msg_to

    s = None
    try:
        # s = smtplib.SMTP_SSL("smtp.163.com",465)
        s = smtplib.SMTP(settings.client_smtp_url, settings.client_smtp_port)
        s.login(settings.msg_from, settings.passwd)
        # Deliver to the configured recipient; the envelope previously used
        # msg_from as the recipient, ignoring msg_to.
        s.sendmail(settings.msg_from, settings.msg_to, msg.as_string())
        print("发送成功")
    except smtplib.SMTPException as e:
        print("发送失败")
        print(e)
    finally:
        # `s` stays None when the connection could not be opened; calling
        # quit() unconditionally would mask the original error.
        if s is not None:
            try:
                s.quit()
            except smtplib.SMTPException:
                pass  # the connection is already unusable
if __name__ == '__main__':
    # Manual smoke test: emits one log record per level through the shared
    # logger. The actual mail send is left disabled.
    subject = "python邮件测试"
    content = "这是我使用python smtplib及email模块发送的邮件"
    # Logger.warn is a deprecated alias; warning() is the supported name.
    settings.logger.warning('this is a warn')
    settings.logger.error('this is an error')
    settings.logger.critical('this is a critical')
    #send_email(subject,content)
\ No newline at end of file
#coding=utf-8
import urllib
import urllib2
import json
import time
import datetime
import time
import MySQLdb
import datetime
import logging
import sys,os
import csv
import rdcalendar
reload(sys)
sys.setdefaultencoding('utf8')
def _fetch_scalar(cur, sql, params=None):
    """Execute ``sql`` on ``cur`` and return the first column of the first row."""
    print("%s -- params: %r" % (sql, params))
    cur.execute(sql, params)
    return cur.fetchall()[0][0]


def stat_calcprofit(ndays):
    """Summarise one calendar day's matched news into ``tbl_stats``.

    Today's date is located in the trading calendar returned by
    ``rdcalendar.rdcalen()``; the entry ``ndays`` positions after it selects
    the day to summarise. For that day the summed ``sell_price - buy_price``,
    the number of distinct news ids and the number of distinct stock codes
    are read from ``view_match_news_stock`` and written to the ``tbl_stats``
    row carrying today's date (inserted if missing, otherwise updated).

    Args:
        ndays: offset into the calendar list, relative to today's entry.

    Returns:
        None. Returns early, after printing a message, when today is not in
        the calendar, the offset falls outside the calendar, or the database
        connection cannot be opened.
    """
    calenarray = rdcalendar.rdcalen()
    print(calenarray)
    list_time = datetime.date.today().strftime('%Y%m%d')
    try:
        num = calenarray.index(list_time)
    except ValueError:
        print("Today is not a tradeday!")
        return

    target = num + ndays
    if not 0 <= target < len(calenarray):
        # Previously an IndexError (or a silent wrap-around for negatives).
        print("Trade calendar has no entry %d rows from today" % ndays)
        return
    pretime = calenarray[target]

    try:
        # NOTE(review): connection settings are hard-coded here.
        conn = MySQLdb.connect(
            host='10.25.24.52',
            port=3306,
            user='nlp',
            passwd='123456',
            db='nlp',
            charset="utf8"
        )
    except MySQLdb.Error as e:
        print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
        # Without a connection nothing below can run.
        return

    try:
        cur = conn.cursor()
        # Shared FROM/WHERE clause of the three aggregate queries; the day is
        # bound as a parameter instead of being formatted into the SQL text.
        day_filter = (" FROM view_match_news_stock"
                      " WHERE TO_DAYS(%s)=TO_DAYS(news_time) AND newsUptrendMan=1"
                      " AND buy_price > 0.0 AND sell_price>0.0")

        profit = _fetch_scalar(
            cur, "SELECT SUM(sell_price-buy_price)" + day_filter, (pretime,))
        if profit is None:
            # SUM over zero rows yields NULL.
            profit = 0
        print(profit)

        numNews = _fetch_scalar(
            cur, "SELECT COUNT(DISTINCT(news_id))" + day_filter, (pretime,))
        print(numNews)

        numStock = _fetch_scalar(
            cur, "SELECT COUNT(DISTINCT(stock_code))" + day_filter, (pretime,))
        print(numStock)

        # The stats row is keyed by today's date, not by `pretime`.
        str_time = time.strftime('%Y-%m-%d', time.localtime())
        # Check whether a row for today already exists.
        numCount = _fetch_scalar(
            cur,
            "SELECT COUNT(*) FROM tbl_stats WHERE DATE(create_datetime)=%s",
            (str_time,))
        print(numCount)

        if numCount == 0:
            strSql = ("insert into tbl_stats(profit, num_news, num_stocks, create_datetime)"
                      " VALUES (%s, %s, %s, %s)")
        else:
            strSql = ("update tbl_stats set profit=%s, num_news=%s, num_stocks=%s"
                      " WHERE DATE(create_datetime)=%s")
        print(strSql)
        cur.execute(strSql, (profit, numNews, numStock, str_time))
        conn.commit()
        print("success")
    finally:
        conn.close()
# Script entry: runs the statistics job with a calendar offset of 2.
# NOTE(review): there is no `if __name__ == '__main__'` guard, so importing
# this module also runs the job -- confirm that is intended.
ndays = 2
stat_calcprofit(ndays)
# encoding=utf-8
import numpy as np
import collections
import csv
import string
import sys
import time
reload(sys)
import math
import datetime
sys.setdefaultencoding('utf-8')
def rdcalen():
    """Return the first column of ``trdDateFrom2005.csv`` as a list of strings.

    The file is looked up relative to the current working directory. Rows
    are returned in file order; empty rows (e.g. a trailing blank line) are
    skipped instead of raising ``IndexError``.

    Returns:
        list of str: the first field of every non-empty CSV row.
    """
    calendarfile = 'trdDateFrom2005.csv'
    # The csv module expects a binary file on Python 2 and a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] >= 3:
        csvfile = open(calendarfile, "r", newline="")
    else:
        csvfile = open(calendarfile, "rb")
    # Close the file as soon as the rows have been consumed.
    with csvfile:
        return [row[0] for row in csv.reader(csvfile) if row]
import urllib
import urllib2
import json
import time
import MySQLdb
import datetime
import logging
import sys,os
import csv
reload(sys)
sys.setdefaultencoding('utf8')
def getprice(stockcode, N):
    """Fetch one field of a stock's quote line from hq.sinajs.cn.

    Args:
        stockcode: numeric stock code as a string. Codes >= 600000 are
            requested with the "sh" prefix, all others with "sz".
        N: index of the wanted field in the comma-separated quote payload.

    Returns:
        str: field ``N`` of the text between the first pair of double quotes
        in the response.

    Raises:
        urllib2.URLError: when the request fails. The function used to call
            ``exit()`` here, terminating the whole process from library code.
        IndexError: when the response has no quoted payload or fewer than
            ``N + 1`` fields.
    """
    sec = "sh" if int(stockcode) >= 600000 else "sz"
    url = "http://hq.sinajs.cn/list=" + sec + stockcode
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'
    headers = {
        'Referer': 'http://finance.sina.com.cn/',
        'User-Agent': user_agent
    }
    # data="" (rather than None) is kept as in the original request; with
    # urllib2 a non-None data value makes this a POST.
    req = urllib2.Request(
        url=url,
        data="",
        headers=headers
    )
    try:
        result = urllib2.urlopen(req)
    except urllib2.URLError:
        print("urlopen doesn't work, sina API not work")
        raise
    try:
        text = result.read()
    finally:
        result.close()
    # The payload looks like: var hq_str_xx000000="f0,f1,f2,...";
    stockinfo = text.split('"')[1].split(',')
    return stockinfo[N]
# encoding=utf-8
import os
import logging
import sys
# ---------------------------------------------------------------
# logger settings
# ---------------------------------------------------------------
_DEBUG = True

# Platform switch driven by os.name ("nt" on Windows, "posix" on Linux;
# Linux is treated as the default). It selects the log file location and
# the host IP.
if os.name == "nt":
    loggingFile = "sina.log"
    hostname = "localhost"
else:
    loggingFile = "/home/nlp/nlp/crawler_sina_breaking/sina.log"
    hostname = "47.99.110.89"
server = "47.99.110.89"

logging.basicConfig(
    filename=loggingFile,
    filemode="a+",
    format="%(asctime)s-%(name)s-%(levelname)s-%(message)s",
    level=logging.INFO
)
logger = logging.getLogger("log_sina")

# ---------------------------------------------------------------
# api interface
# ---------------------------------------------------------------
url_jieba = "http://%s:18080/api/rest/nlp/extractTags" % server
url_simTest = "http://%s:10090/api/rest/nlp/matchscore" % server
url_sentosen = "http://%s:10094/api/rest/nlp/sentosen" % server

# ---------------------------------------------------------------
# email settings
# ---------------------------------------------------------------
msg_from = 'tongzhi@digjob.net'
passwd = '000000'
msg_to = 'tongzhi@digjob.net'
ssl = 0
client_smtp_url = "smtp.ym.163.com"
client_smtp_port = "25"
def func_name():
    """Return the name of the function whose frame is currently executing.

    The inspected frame is this function's own, so the result is always
    the string "func_name".
    """
    current_code = sys._getframe().f_code
    return str(current_code.co_name)


if __name__ == '__main__':
    func_name()
# coding=utf-8
import sys
import MySQLdb
import datetime
import time
import searchprice
import rdcalendar
reload(sys)
sys.setdefaultencoding('utf8')
def wr_sellprice():
    """Write the current quote as ``sell_price`` for one calendar day's matches.

    Today's date is located in the trading calendar returned by
    ``rdcalendar.rdcalen()``; the entry two positions after it selects the
    news day. Every ``tbl_match`` row tied to up-trend news of that day and
    having a buy price gets ``sell_price`` / ``sell_datetime`` set from
    ``searchprice.getprice(stock_code, 1)``. A failed quote lookup is
    recorded as price 0.

    Returns:
        None. Returns early, after printing a message, when today is not in
        the calendar, the calendar is too short, or the database connection
        cannot be opened.
    """
    calenarray = rdcalendar.rdcalen()
    print(calenarray)
    list_time = datetime.date.today().strftime('%Y%m%d')
    try:
        num = calenarray.index(list_time)
    except ValueError:
        print("Today is not a tradeday!")
        return
    if num + 2 >= len(calenarray):
        # Previously an unhandled IndexError.
        print("Trade calendar has no entry 2 rows after today")
        return
    pretime = calenarray[num + 2]

    try:
        # NOTE(review): connection settings are hard-coded here.
        conn = MySQLdb.connect(
            host='10.25.24.52',
            port=3306,
            user='nlp',
            passwd='123456',
            db='nlp',
            charset="utf8"
        )
    except MySQLdb.Error as e:
        print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
        # Without a connection nothing below can run.
        return

    try:
        cur = conn.cursor()
        cur.execute(
            "select * from tbl_match where news_fk_id in "
            "(select id from tbl_news where TO_DAYS(news_time)=TO_DAYS(%s) and newsUptrendMan=1) "
            "and buy_price is not null",
            (pretime,))
        matchlist = cur.fetchall()

        # Quotes already fetched during this run, keyed by stock code.
        selldict = {}
        for matchitem in matchlist:
            # Columns are addressed by position because the query is
            # `select *`: 0 is the match id and 2 the stock id
            # (assumes the live column order -- TODO confirm).
            matchid = matchitem[0]
            stockid1 = matchitem[2]

            cur.execute("select stock_code from tbl_stock where id=%s", (stockid1,))
            result = cur.fetchone()
            if result is None:
                # Used to crash on "".join(None).
                print("No tbl_stock row for id %s, match %s skipped" % (stockid1, matchid))
                continue
            stockcode = "".join(result)

            if stockcode in selldict:
                sellprice = float(selldict[stockcode])
            else:
                try:
                    sellprice = float(searchprice.getprice(stockcode, 1))
                    selldict[stockcode] = sellprice
                except (Exception, SystemExit):
                    # Best effort: the price lookup may fail or even try to
                    # exit the process; record 0 and carry on. Unlike the
                    # former bare except, Ctrl-C is no longer swallowed.
                    sellprice = 0

            str_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
            cur.execute(
                "Update tbl_match set sell_price=%s,sell_datetime=%s where id=%s",
                (sellprice, str_time, matchid))
            # Commit per row so finished updates survive a later failure.
            conn.commit()
    finally:
        conn.close()
# Script entry: runs the sell-price job.
# NOTE(review): there is no `if __name__ == '__main__'` guard, so importing
# this module also runs the job -- confirm that is intended.
wr_sellprice()
-- Create the `nlp` schema (utf8 / utf8_general_ci) unless it already exists.
CREATE DATABASE IF NOT EXISTS nlp DEFAULT CHARSET utf8 COLLATE utf8_general_ci;
\ No newline at end of file
use nlp;
#centos7
# Relax the validate_password settings (policy 0, minimum length 6),
# presumably so the six-character password in the grants below is accepted.
set global validate_password_policy=0;
set global validate_password_length=6;
# Give the `nlp` account full rights on the nlp schema when connecting from
# any host ('%'), from localhost and from 127.0.0.1.
# NOTE(review): the password is a hard-coded weak value -- confirm it is
# replaced outside development environments.
grant all privileges on nlp.* to nlp@'%' identified by '123456';
grant all privileges on nlp.* to nlp@'localhost' identified by '123456';
grant all privileges on nlp.* to nlp@'127.0.0.1' identified by '123456';
flush privileges;
/*==============================================================*/
/* DBMS name: MySQL 5.0 */
/* Created on: 2020/3/30 15:36:29 */
/*==============================================================*/
/* Remove any existing copies so the script can be re-run from scratch.     */
/* Running it discards whatever rows those tables currently hold.           */
drop table if exists tbl_match;
drop table if exists tbl_news;
drop table if exists tbl_stock;
/*==============================================================*/
/* Table: tbl_match */
/*==============================================================*/
/* Match table: one row per (news, stock) pairing with its similarity score. */
create table tbl_match
(
   id                   int(11) not null auto_increment  comment 'id',
   news_id              int(11) not null  comment '新闻id',
   stock_id             int(11)  comment '股票id',
   match_score          varchar(20)  comment '相似度得分',
   create_datetime      DATETIME NOT NULL DEFAULT '0000-00-00 00:00:00',
   update_datetime      DATETIME DEFAULT NULL,
   primary key (id),
   /* Index syntax is KEY <name> (<cols>); the name and keyword were swapped. */
   key query_key (stock_id, match_score)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='匹配表';
/*==============================================================*/
/* Table: tbl_news */
/*==============================================================*/
/* News table: one row per news item, with its keyword and weight lists. */
CREATE TABLE tbl_news
(
   id                   INT(11) NOT NULL AUTO_INCREMENT COMMENT 'id',
   news_title           VARCHAR(200) NOT NULL COMMENT '新闻标题',
   news_content         VARCHAR(2000) COMMENT '新闻内容',
   news_source          VARCHAR(200) COMMENT '新闻源头',
   news_segment         VARCHAR(200) COMMENT '新闻关键词列表',
   news_time            DATETIME DEFAULT NULL COMMENT '新闻事件时间',
   news_keyword_list    VARCHAR(200) COMMENT '新闻关键词列表',
   word_weight_list     VARCHAR(200) COMMENT '新闻权重列表',
   `create_datetime`    DATETIME NOT NULL DEFAULT '0000-00-00 00:00:00' COMMENT '创建时间',
   `update_datetime`    DATETIME DEFAULT NULL COMMENT '更新时间',
   PRIMARY KEY (id),
   KEY query_key (news_title)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='新闻表';
/*==============================================================*/
/* Table: tbl_stock */
/*==============================================================*/
/* Stock table: one row per stock code, with its link and keyword list. */
CREATE TABLE tbl_stock
(
   id                   INT(11) NOT NULL AUTO_INCREMENT COMMENT 'id',
   stock_code           VARCHAR(200) COMMENT '股票代码',
   stock_url            VARCHAR(200) COMMENT '股票链接',
   stock_keyword_list   VARCHAR(200) COMMENT '股票关键词列表',
   version              VARCHAR(10) COMMENT '版本号',
   create_datetime      TIMESTAMP COMMENT '创建时间',
   PRIMARY KEY (id),
   KEY query_key (stock_code)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='股票表';
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论