﻿# encoding=utf-8
import urllib
import json
import time
import pymysql
import datetime
import settings
import sys,os
import searchprice
import abstock
import re
import math
import chardet
import subprocess
import requests
from apiUtil import *


#####################
# 核心os.name
# windows=nt
# linux=posix
# default是linux系统
#
# 判断hostip
#####################

def judgetradetime():
    """Tell whether the current local time falls inside a trading session.

    The two sessions are 09:30-11:30 and 13:00-15:00 of the current day,
    both ends inclusive.  The day of the week is not checked.

    The original comment says the result is meant to decide whether
    buyprice is queried and written; no such caller is visible here.

    Returns:
        1 when the current time is inside either session, otherwise 0.
    """
    moment = datetime.datetime.now()
    today = datetime.date.today()

    # (open, close) wall-clock bounds of the morning and afternoon sessions.
    sessions = (
        (datetime.time(9, 30, 0), datetime.time(11, 30, 0)),
        (datetime.time(13, 0, 0), datetime.time(15, 0, 0)),
    )
    for begin, finish in sessions:
        lower = datetime.datetime.combine(today, begin)
        upper = datetime.datetime.combine(today, finish)
        if lower <= moment <= upper:
            return 1
    return 0

def checkRunning():
    """Count the other pySinaBreking python processes listed by ``ps``.

    Runs a ``ps -ef | grep ... | wc -l`` shell pipeline and subtracts 2
    from the line count.  The caller treats a positive result as
    "another instance is already running".

    Returns:
        The number of matching ``ps`` lines minus 2, as an int.
    """
    command = "/bin/ps -ef |grep python |grep pySinaBreking |wc -l "
    print("command: "+command)
    pipeline = subprocess.Popen([command], stdout=subprocess.PIPE, shell=True)
    counted, _ = pipeline.communicate()
    # Two of the matching lines are always our own, e.g.:
    #   root 11845 10310 0 18:37 pts/1 00:00:00 python pySinaBreking.py
    #   root 11846 11845 0 18:37 pts/1 00:00:00 /bin/sh -c /bin/ps -ef |grep python |grep pySinaBreking
    # so they are subtracted from the count.
    return int(counted) - 2

# Single-instance guard: on non-Windows hosts, stop right away when
# checkRunning() reports that another pySinaBreking process exists.
ret = 0
if os.name != "nt":
    ret = checkRunning()
if ret > 0:
    print("pySinaBreking is running, no need to run\n")
    settings.logger.info("pySinaBreking is running, no need to run")
    # sys.exit() rather than the bare exit() helper: exit() is injected by
    # the site module for interactive use and is missing under `python -S`.
    sys.exit()

# Open the MySQL connection and cursor used for the rest of the run.
# NOTE(review): user and password are hard-coded here; consider moving
# them into settings next to hostname.
conn = pymysql.connect(
    host=settings.hostname,
    port=3306,
    user='nlp',
    passwd='123456',
    database='nlp',
    charset="utf8",
)
cur = conn.cursor()

url = 'http://zhibo.sina.com.cn/api/zhibo/feed?&page=1&page_size=20&zhibo_id=152&tag_id=0&dire=f&dpc=1&_=1538976698670'

user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
headers = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'close',
    'Content-Type': 'application/json',
    'Host': 'zhibo.sina.com.cn',
    'Referer': 'http://finance.sina.com.cn/7x24/',
    'User-Agent': user_agent,
}

# NOTE(review): this JSON body is sent with a GET request although the
# query string above already carries the same paging parameters.
data = json.dumps({
    'page': 1,
    'page_size': '20',
    'zhibo_id': '152',
})

# The request itself must be inside the try block: connection errors and
# timeouts are raised by requests.get(), not by reading req.text.
try:
    req = requests.get(url, headers=headers, data=data, timeout=30)
    result = req.text
except requests.RequestException:
    print("urlopen doesn't work, sina API not work")
    settings.logger.error("urlopen doesn't work, sina API not work")
    # Release the database resources before leaving.
    cur.close()
    conn.close()
    sys.exit()
settings.logger.info("urlopen work, sina API works")
settings.logger.info("print text"+str(result))
hjson = json.loads(result)


# Newest news_time already stored; only newer feed items are inserted later.
cur.execute("select max(news_time) from tbl_news")
str_latest = cur.fetchone()

if str_latest[0] is None:
    # Empty table: fall back to a fixed timestamp in 2007 so that every
    # feed item counts as new.  Sub-second precision is dropped.
    ref_datetime = datetime.datetime.fromtimestamp(1180759620.859).replace(microsecond=0)
    str_time = ref_datetime.strftime('%Y-%m-%d %H:%M:%S')
    print("str_time is: "+str(str_time))
else:
    ref_datetime = str_latest[0]
if settings._DEBUG == True:
    print("ref_datetime is: "+str(ref_datetime))
settings.logger.info("print ref_datetime"+str(ref_datetime))

#---------------------------****************************************--------------------------------------------#
# HTML character-entity handling for the mini-program (added 2017-03-06;
# the author's name is garbled in the original comment).
#
#   char   numeric   named
#   "      &#34;     &quot;
#   &      &#38;     &amp;
#   <      &#60;     &lt;
#   >      &#62;     &gt;
def replaceCharEntity(htmlstr):
    """Replace HTML character entities in *htmlstr* with plain characters.

    Both named (``&lt;``) and numeric (``&#60;``) forms are recognised for
    the entries of the lookup table below.  Any other entity is replaced
    by a single space and reported through ``settings.logger.error``.

    Each entity is translated on its own.  The previous implementation
    called ``sub`` with ``count=0``, which replaced every entity in the
    string with the character of the first one found, so ``&lt;b&gt;``
    became ``<b<``.

    Substitution is repeated until no entity-shaped text remains, so a
    doubly encoded sequence such as ``&amp;lt;`` ends up as ``<``.

    Args:
        htmlstr: text that may contain HTML character entities.

    Returns:
        The text with every entity replaced.
    """
    # NOTE(review): 'nbsp' maps to ',' while its numeric form '160' maps to
    # a space - kept as found; confirm that this is intended.
    CHAR_ENTITIES = {'nbsp': ',', '160': ' ',
                     'lt': '<', '60': '<',
                     'gt': '>', '62': '>',
                     'amp': '&', '38': '&',
                     'quot': '"', '34': '"',
                     }
    re_charEntity = re.compile(r'&#?(?P<name>\w+);')

    def _decode(match):
        # 'name' is the entity without '&', '#' and ';' (e.g. 'gt' or '62').
        key = match.group('name')
        try:
            return CHAR_ENTITIES[key]
        except KeyError:
            settings.logger.error("replaceCharEntity not work")
            # Unknown entity: substitute a blank.
            return ' '

    # Every match (at least 3 characters) turns into a single character,
    # so the string shrinks on each pass and the loop terminates.
    while re_charEntity.search(htmlstr):
        htmlstr = re_charEntity.sub(_decode, htmlstr)
    return htmlstr
#---------------------------****************************************--------------------------------------------#

def _lookup_stock_id(cursor, stock_code):
    """Return the newest tbl_stock id for *stock_code*, or None if absent."""
    # The code is passed as a query parameter instead of being concatenated
    # into the SQL text.
    cursor.execute(
        "SELECT id FROM tbl_stock WHERE stock_code=%s ORDER BY id DESC LIMIT 1",
        (stock_code,),
    )
    row = cursor.fetchone()
    return None if row is None else int(row[0])


def _insert_match(cursor, news_id, stock_id, score, stamp):
    """Insert one news/stock match row into tbl_match."""
    # round(..., 6) keeps the six decimals that the former '%f' formatting
    # wrote into the statement.
    cursor.execute(
        "INSERT INTO tbl_match (news_id, stock_id, match_score, create_datetime, update_datetime) VALUES(%s, %s, %s, %s, %s);",
        (news_id, stock_id, round(float(score), 6), stamp, stamp),
    )


# NOTE(review): this section logs through both settings.logger and
# settings.logging; the calls are kept exactly as found because settings is
# not visible here - confirm whether settings.logger was meant throughout.
settings.logger.info("print before loop")
tweets = hjson['result']['data']['feed']
sina_tw = tweets.get('list') or []
# Strips anything that looks like an HTML tag; compiled once for all items.
dr = re.compile(r'<[^>]+>', re.S)
for tweet in sina_tw:
    print("enter")
    str_time = tweet.get('create_time')
    raw_text = tweet.get('rich_text')
    if raw_text is None:
        settings.logger.warning("pySinaBreking: feed item without rich_text, skipped")
        continue
    # Keep the text as str.  Encoding it to bytes and then calling str() on
    # the result yields the "b'...'" repr on Python 3, which is what used
    # to be cleaned, sent to the APIs and stored.
    str_content = raw_text.replace("\n", "")
    settings.logger.info("print str_content before" + str_content)

    # Mini-program specific clean-up: drop HTML tags (2017-02-21), then
    # translate HTML entities such as quot/amp (2017-03-06).
    str_content = dr.sub('', str_content)
    str_content = replaceCharEntity(str_content)

    settings.logger.info("print str_content after " + str_content)

    str_output = str(str_time)+","+str_content+'\n'

    # Compare the item's timestamp with the newest stored one before any
    # API call, so items that are already stored cost no request.
    cur_datetime = datetime.datetime.strptime(str_time, '%Y-%m-%d %H:%M:%S')
    if settings._DEBUG == True:
        print("cur_datetime is : "+str(cur_datetime))
    if cur_datetime <= ref_datetime:
        settings.logging.info("pySinaBreking: old news, no need to process")
        continue

    settings.logger.info("print before extract api")
    # 1. Call the REST interface that segments the Chinese text and
    #    returns keywords with their weights.
    post_data = {
        'content': str_content,
        'topK': 10,
        'pos': 'ns,n',
    }
    print("jieba: "+str(settings.url_jieba))
    try:
        data = http_post_request(settings.url_jieba, post_data)
        settings.logger.info("urlopen work, extractTags API works" + str_content)
    except Exception:
        print("urlopen doesn't work, extractTags API not work")
        settings.logging.error("pySinaBreking: extractTags API not work" + str_content)
        continue
    str_keyword = " ".join(data['result']['tags'])
    str_weights = " ".join([str(i) for i in data['result']['weights']])

    if settings._DEBUG == True:
        print("enter into insert into tbl_news ops")
        print("news_content: "+str_output)
    # Store the news item together with its keyword and weight lists.
    cur.execute('SET NAMES utf8;')
    str_sql = "INSERT INTO tbl_news (news_time, news_title, news_content, news_source,news_keyword_list,word_weight_list, create_datetime,update_datetime) VALUES(%s, %s, %s, %s, %s, %s, %s, %s);"
    cur.execute(str_sql, (str_time, 'test', str_content, 'sina', str_keyword, str_weights, str_time, str_time))

    # Id of the news row that was just inserted.
    newsFkId = int(cur.lastrowid)
    if settings._DEBUG == True:
        print("newsFkId is ", newsFkId)

    # 2. Call the REST interface that computes the similarity scores.
    url_simTest = settings.url_simTest
    post_data = {
        'keywordlist': str(str_keyword),
        'weightlist': str(str_weights),
    }
    if settings._DEBUG == True:
        print(url_simTest+"?keywordlist="+str_keyword+"&weightlist="+str_weights)

    if str_keyword == "":
        settings.logging.warning("keywordlist is \"\"" + str_content)
        continue

    try:
        data = http_post_request(settings.url_simTest, post_data)
        settings.logger.info("urlopen work, matchscore API works" + str(str_keyword))
    except Exception:
        print("urlopen doesn't work, matchscore API not work")
        settings.logging.warning("urlopen doesn't work, matchscore API not work" + str(str_keyword))
        # Skip this item only; exiting here would stop the remaining items
        # from being processed (original note dated 2017-01-20).
        continue

    list_stock = data['result']['stockcodes']
    list_weights = data['result']['sims']

    # Exact matching (2016-10-20): stock codes that abstock finds directly
    # in the text.  A falsy result is treated as "no exact matches".
    abresult = abstock.methodre(str_content) or []

    # Similarity hits that exact matching did not already find keep their
    # own score; exact matches are stored with a score of 1.0.
    scored = [
        (code, weight)
        for code, weight in zip(list_stock, list_weights)
        if code not in abresult
    ]
    scored.extend((code, 1.0) for code in abresult)

    # Write one tbl_match row per matched stock.
    for stock_code, score in scored:
        if settings._DEBUG == True:
            print("stock_code: " + str(stock_code))
        stockFkId = _lookup_stock_id(cur, stock_code)
        if stockFkId is None:
            # Unknown code: skip it instead of crashing on fetchone()[0]
            # and losing the whole uncommitted batch.
            settings.logger.warning("pySinaBreking: stock_code not found in tbl_stock: " + str(stock_code))
            continue
        if settings._DEBUG == True:
            print("stockFkId is: %d " % stockFkId)
        _insert_match(cur, newsFkId, stockFkId, score, str_time)

# Everything above is committed in one go once all items are processed.
time.sleep(1)
print("success")
settings.logging.warning("success")
cur.close()
conn.commit()
conn.close()
settings.logging.warning("conn close")


