# -*- coding: utf-8 -*-
"""Scrape contact ids, names and e-mail cells from a CRM listing page.

Usage: python <this script> URL

The page at URL is fetched with the session cookies from ``conf.cookies``.
The fetch time is appended to ``datetime.txt`` and the raw response is saved
to ``tmp.txt`` (both next to this script). One ``cid`name`email`` line per
contact is then printed to stdout.
"""
import datetime
import io
import os
import re
import sys

ABS_PATH = os.path.dirname(os.path.realpath(__file__)) + '/'

# Seconds to wait for the server before giving up, so an unattended run
# cannot hang forever on a stalled connection.
REQUEST_TIMEOUT = 60

# Browser-like request headers. Session cookies are supplied separately via
# conf.cookies; never hard-code them in this file.
# NOTE(review): 'br' is advertised in Accept-Encoding -- confirm the runtime
# can decode Brotli responses, otherwise the saved page may be unreadable.
HEADERS = {
    'Host': 'aiacademy.neticrm.tw',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:70.0) Gecko/20100101 Firefox/70.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-TW,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Cache-Control': 'max-age=0',
    'TE': 'Trailers',
}

# A contact link up to the end of its table cell; group 1 is the contact id.
CONTACT_CELL_RE = re.compile(r'cid=([0-9]+)">.*?</td>')
# The text that follows the next '>' (i.e. the content after the next tag).
NEXT_TEXT_RE = re.compile(r'>([^<>]*)')


def parse_contacts(page):
    """Extract contacts from the HTML text of a listing page.

    Args:
        page: The page markup as a text string.

    Returns:
        A dict mapping contact id (string of digits) to ``[name, email]``.
        The name is the parenthesised text of the link cell when the cell
        contains both '(' and ')', otherwise the link text. The e-mail is
        the text that follows the first '>' after the cell, or '' when
        nothing follows. Both are stripped of surrounding whitespace. When a
        contact id occurs in several cells, the last cell wins.
    """
    contacts = {}
    # An identical cell fragment always resolves to the data found at its
    # first occurrence, so each distinct fragment is parsed only once.
    parsed = {}
    for match in CONTACT_CELL_RE.finditer(page):
        cell = match.group(0)
        if cell not in parsed:
            if '(' in cell and ')' in cell:
                name = cell.split('(')[1].split(')')[0]
            else:
                name = cell.split('>')[1].split('<')[0]
            # Search forward from the end of this match instead of
            # re-splitting the whole page on the matched text.
            following = NEXT_TEXT_RE.search(page, match.end())
            email = following.group(1) if following else ''
            parsed[cell] = (match.group(1), name.strip(), email.strip())
        cid, name, email = parsed[cell]
        contacts[cid] = [name, email]
    return contacts


def format_contacts(contacts):
    """Render contacts as newline-terminated ``cid`name`email`` rows.

    Args:
        contacts: Mapping of contact id to ``[name, email]``.

    Returns:
        The concatenated rows as one string ('' for an empty mapping).
    """
    return ''.join(
        '`'.join([cid, name, email]) + '\n'
        for cid, (name, email) in contacts.items()
    )


def main(argv):
    """Fetch the URL given in ``argv[1]`` and print the contacts found.

    Args:
        argv: Command-line arguments, including the program name.

    Returns:
        Process exit status: 0 after printing the rows, 1 when the URL
        argument is missing.
    """
    # Imported here so the parsing helpers above can be imported without the
    # HTTP client or the credentials module being available.
    import requests
    from conf import cookies

    if len(argv) < 2:
        print('lack argv 1')
        return 1
    url = argv[1]

    now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    res = requests.get(url, cookies=cookies, headers=HEADERS,
                       timeout=REQUEST_TIMEOUT)

    with open(ABS_PATH + 'datetime.txt', 'a') as f:
        f.write(now + '\n')

    if res.status_code != 200:
        # Report the failure on stderr (without echoing the cookies) and
        # still process the body, as this script has always done.
        sys.stderr.write('fail. %s\n%s\n' % (res.status_code, url))

    page = res.text
    with io.open(ABS_PATH + 'tmp.txt', 'w', encoding='utf8') as f:
        f.write(page)

    output = format_contacts(parse_contacts(page))
    if sys.version_info[0] < 3:
        # Python 2 cannot print non-ASCII text to a pipe; emit UTF-8 bytes.
        output = output.encode('utf8')
    print(output)
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))