
# -*- coding: utf-8 -*-

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pyquery import PyQuery as pq
from time import sleep
import random




# Define a Taobao crawler class
class taobao_infos:

    # Initialise the object
    def __init__(self):
        url = 'https://login.taobao.com/member/login.jhtml'
        self.url = url

        options = webdriver.ChromeOptions()
        options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})  # do not load images, to speed up page loads
        options.add_experimental_option('excludeSwitches', ['enable-automation'])  # important: hide the automation flag so sites are less likely to detect Selenium
        self.browser = webdriver.Chrome(executable_path=chromedriver_path, options=options)

        self.wait = WebDriverWait(self.browser, 10)  # timeout of 10 seconds


    # Log in to Taobao
    def login(self):

        # Open the login page
        self.browser.get(self.url)

        # Implicit wait, then click the password-login option
        self.browser.implicitly_wait(30)  # wait until the page has loaded, at most 30 seconds
        self.browser.find_element_by_xpath('//*[@class="forget-pwd J_Quick2Static"]').click()

        # Implicit wait, then click the Weibo login entry
        self.browser.implicitly_wait(30)
        self.browser.find_element_by_xpath('//*[@class="weibo-login"]').click()

        # Implicit wait, then type the Weibo account name
        self.browser.implicitly_wait(30)
        self.browser.find_element_by_name('username').send_keys(weibo_username)

        # Implicit wait, then type the Weibo password
        self.browser.implicitly_wait(30)
        self.browser.find_element_by_name('password').send_keys(weibo_password)

        # Implicit wait, then click the confirm-login button
        self.browser.implicitly_wait(30)
        self.browser.find_element_by_xpath('//*[@class="btn_tip"]/a/span').click()

        # Only when the Taobao member nickname appears can we be sure the login succeeded
        taobao_name = self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.site-nav-bd > ul.site-nav-bd-l > li#J_SiteNavLogin > div.site-nav-menu-hd > div.site-nav-user > a.site-nav-login-info-nick ')))
        # Print the Taobao nickname
        print(taobao_name.text)


    # Simulate scrolling down the page
    def swipe_down(self, second):
        for i in range(int(second / 0.1)):
            # Alternate the scroll position depending on i, to look like manual browsing
            if (i % 2 == 0):
                js = "var q=document.documentElement.scrollTop=" + str(300 + 400 * i)
            else:
                js = "var q=document.documentElement.scrollTop=" + str(200 * i)
            self.browser.execute_script(js)
            sleep(0.1)

        js = "var q=document.documentElement.scrollTop=100000"
        self.browser.execute_script(js)
        sleep(0.1)


    # Crawl the "items I have bought" data on Taobao
    def crawl_good_buy_data(self):

        # Open the "items I have bought" page
        self.browser.get("https://buyertrade.taobao.com/trade/itemlist/list_bought_items.htm")

        # Iterate over all pages
        for page in range(1, 1000):

            # Wait until all purchased-item data on this page has loaded
            good_total = self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#tp-bought-root > div.js-order-container')))

            # Get the page source
            html = self.browser.page_source

            # Parse the page source with pyquery
            doc = pq(html)

            # Collect the purchased items on this page
            good_items = doc('#tp-bought-root .js-order-container').items()

            # Iterate over all items on this page
            for item in good_items:
                good_time_and_id = item.find('.bought-wrapper-mod__head-info-cell___29cDO').text().replace('\n', "").replace('\r', "")
                good_merchant = item.find('.seller-mod__container___1w0Cx').text().replace('\n', "").replace('\r', "")
                good_name = item.find('.sol-mod__no-br___1PwLO').text().replace('\n', "").replace('\r', "")
                # Only purchase time, order id, seller name and item name are listed here;
                # extract the remaining fields yourself as an exercise.
                print(good_time_and_id, good_merchant, good_name)

            print('\n\n')

            # Most people get flagged as robots because they skip simulating human behaviour.
            # Scroll down the page as a human would, to avoid being detected as a bot,
            # using a random scrolling duration.
            swipe_time = random.randint(1, 3)
            self.swipe_down(swipe_time)

            # Wait for the "next page" button to appear
            good_total = self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.pagination-next')))
            # Click the "next page" button
            good_total.click()
            sleep(2)



if __name__ == "__main__":

    # Before using this script, read the README.MD in the current directory.

    chromedriver_path = "c://chromedriver.exe"  # change to the full path of your chromedriver
    weibo_username = "改成你的微博账号"  # change to your Weibo account
    weibo_password = "改成你的微博密码"  # change to your Weibo password

    a = taobao_infos()
    a.login()  # log in
    a.crawl_good_buy_data()  # crawl the "items I have bought" data

https://github.com.cnpmjs.org/shengqiangzhang/examples-of-web-crawlers/tree/master/3.%E6%B7%98%E5%AE%9D%E5%B7%B2%E4%B9%B0%E5%88%B0%E7%9A%84%E5%AE%9D%E8%B4%9D%E6%95%B0%E6%8D%AE%E7%88%AC%E8%99%AB(%E5%B7%B2%E6%A8%A1%E6%8B%9F%E7%99%BB%E5%BD%95)



Step 1: Use the browser's developer tools to identify how the client connects to the server. The browser sends a set of header fields to the server together with the URL, so to avoid the server rejecting the connection, the Python program prepares the same header information in advance.

Step 2: Generate the Taobao search URLs and crawl the data needed for DGBB products, such as location, product name, store name and sales quantity. If the program needs to dig deeper, the URLs are kept in a list and the results are stored in Excel or JSON.
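As a minimal sketch of that URL generation (assuming the mobile search endpoint http://s.m.taobao.com/search that is used later in this post), one results-page URL could be built like this; the parameter names mirror the postdata dictionary shown further below:

import urllib.parse

def build_search_url(keyword, page, sort=''):
    # Assemble one m.taobao search URL; parameters follow the postdata dict used later.
    params = {'q': keyword, 'page': page, 'm': 'api4h5', 'style': 'list'}
    if sort:
        params['sort'] = sort
    return 'http://s.m.taobao.com/search?' + urllib.parse.urlencode(params)

# e.g. build_search_url('DGBB', 0, '_sale') -> first page, sorted by sales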

Step 3: While there are unprocessed URLs, the program takes one URL from the list and downloads the page. Any new URLs found are appended to the list if they are not already in it. The program parses each page as JSON and writes the required fields to the Excel file.
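A minimal sketch of that crawl loop, with hypothetical download_page, extract_urls and extract_fields helpers standing in for the real download and parsing code shown later:

def crawl(seed_urls, download_page, extract_urls, extract_fields):
    # Simple breadth-first crawl with de-duplication of URLs.
    pending = list(seed_urls)
    seen = set(seed_urls)
    rows = []
    while pending:
        url = pending.pop(0)
        page = download_page(url)            # e.g. getHtml(url) below
        for new_url in extract_urls(page):   # keep only URLs not seen before
            if new_url not in seen:
                seen.add(new_url)
                pending.append(new_url)
        rows.append(extract_fields(page))    # fields later written to Excel
    return rows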

Step 4: Create the sales report with Python.
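For example, once the rows are collected (using the column order of the total list built in the main program below: page, store name, title, discounted price, location, comment count, original price, units sold, ...), a simple per-store sales summary might look like this; the column indexes and the assumption that the "sold" field is a plain number are mine, not the original author's:

def sales_by_store(rows):
    # rows: data rows without the header; index 1 = store name, index 7 = units sold
    summary = {}
    for row in rows:
        store, sold = row[1], int(row[7] or 0)   # assumes "sold" is numeric
        summary[store] = summary.get(store, 0) + sold
    return sorted(summary.items(), key=lambda kv: kv[1], reverse=True)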

3) Python source code.

Step 1: Pre-conditions of the program.

1. The Excel writer package xlsxwriter (see the minimal usage sketch after this list). Installation:

pip3 install xlsxwriter

2. If you want to store the product images, include an image-handling package as well. My installation of it failed, so I skip the product pictures.
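A minimal xlsxwriter sketch (the file name demo.xlsx is just a placeholder), useful to confirm the installation works before running the crawler:

import xlsxwriter

workbook = xlsxwriter.Workbook('demo.xlsx')   # hypothetical output file
worksheet = workbook.add_worksheet()
worksheet.write(0, 0, 'store')                # row 0, column 0
worksheet.write(0, 1, 'sold')
worksheet.write(1, 0, 'example store')
worksheet.write(1, 1, 123)
workbook.close()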

Step 2: Define a function to download the pages.

import http.cookiejar
import urllib.parse
import urllib.request

def getHtml(url, pro='', postdata={}):
    # Download the html; cookies are supported.
    # First argument is the url, second is an optional proxy, third is the post data.
    filename = 'cookie.txt'

    # Declare a MozillaCookieJar object backed by the file
    cj = http.cookiejar.MozillaCookieJar(filename)
    proxy_support = urllib.request.ProxyHandler({'http': 'http://' + pro})

    # Build an opener with cookie support (and the proxy when one is given)
    if pro:
        opener = urllib.request.build_opener(proxy_support, urllib.request.HTTPCookieProcessor(cj))
    else:
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))

    # Add header information to cheat the Taobao server.
    # `cookie` is the logged-in Taobao cookie string prepared beforehand (not shown here).
    opener.addheaders = [('User-Agent', 'Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'),
                         ('Referer', 'http://s.m.taobao.com'),
                         ('Host', 'h5.m.taobao.com'),
                         ('Cookie', cookie)]

    # Install the opener and open the url
    urllib.request.install_opener(opener)

    if postdata:
        postdata = urllib.parse.urlencode(postdata)
        html_bytes = urllib.request.urlopen(url, postdata.encode()).read()
    else:
        html_bytes = urllib.request.urlopen(url).read()

    cj.save(ignore_discard=True, ignore_expires=True)
    return html_bytes
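For instance (assuming the cookie string mentioned above has already been defined), the function can be called directly and the returned bytes written to disk; the cookie.txt file is created next to the script on the first call:

html_bytes = getHtml('http://s.m.taobao.com/search?q=DGBB')
with open('page0.json', 'wb') as f:
    f.write(html_bytes)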

Step 3: Define a function to write data to an Excel file.

import xlsxwriter as wx

def writeexcel(path, dealcontent):
    workbook = wx.Workbook(path)
    worksheet = workbook.add_worksheet()

    # Write every row; in data rows the last column is treated as an image path.
    for i in range(0, len(dealcontent)):
        for j in range(0, len(dealcontent[i])):
            if i != 0 and j == len(dealcontent[i]) - 1:
                if dealcontent[i][j] == '':
                    worksheet.write(i, j, ' ')
                else:
                    try:
                        worksheet.insert_image(i, j, dealcontent[i][j])
                    except:
                        worksheet.write(i, j, ' ')
            else:
                if dealcontent[i][j]:
                    worksheet.write(i, j, dealcontent[i][j].replace(' ', ''))
                else:
                    worksheet.write(i, j, '')

    workbook.close()
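A small usage sketch with a made-up header row and one data row (the last column of a data row is an image path, which is what insert_image expects; an empty cell is written as a space):

rows = [
    ['store', 'title', 'image'],            # header row (i == 0)
    ['example store', 'example item', ''],  # empty image cell is written as ' '
]
writeexcel('demo_output.xlsx', rows)        # hypothetical output path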

Step 4: Write a main program.

import json
import time
import urllib.parse

# createjia (create a directory), listfiles (list files with a given extension) and
# timetochina (format a duration) are small helpers from the original post that are
# not shown there; a hedged sketch of them follows after this block.

def begin():
    today = time.strftime('%Y%m%d', time.localtime())
    a = time.clock()  # time.clock() is removed in Python 3.8+; time.perf_counter() is the modern replacement

    keyword = input('Key words:')
    sort = input('Sort by sales 1, price ascending 2, price descending 3, credit 4, overall 5:')

    try:
        pages = int(input('Pages to crawl (default 100 pages):'))
        if pages > 100 or pages <= 0:
            print('Page number should be between 1 and 100')
            pages = 100
    except:
        pages = 100

    try:
        man = int(input('Seconds to pause between pages (default 4):'))
        if man <= 0:
            man = 4
    except:
        man = 4

    if sort == '1':
        sortss = '_sale'
    elif sort == '2':
        sortss = 'bid'
    elif sort == '3':
        sortss = '_bid'
    elif sort == '4':
        sortss = '_ratesum'
    elif sort == '5':
        sortss = ''
    else:
        sortss = '_sale'

    namess = time.strftime('%Y%m%d%H%S', time.localtime())
    root = '../data/' + today + '/' + namess + keyword
    roota = '../excel/' + today
    mulu = '../image/' + today + '/' + namess + keyword
    createjia(root)
    createjia(roota)

    for page in range(0, pages):
        time.sleep(man)
        print('Suspend ' + str(man) + ' seconds')

        if sortss == '':
            postdata = {
                'event_submit_do_new_search_auction': 1,
                'search': 'provide the search',
                '_input_charset': 'utf-8',
                'topSearch': 1,
                'atype': 'b',
                'searchfrom': 1,
                'action': 'home:redirect_app_action',
                'from': 1,
                'q': keyword,
                'sst': 1,
                'n': 20,
                'buying': 'buyitnow',
                'm': 'api4h5',
                'abtest': 16,
                'wlsort': 16,
                'style': 'list',
                'closeModues': 'nav,selecthot,onesearch',
                'page': page
            }
        else:
            postdata = {
                'event_submit_do_new_search_auction': 1,
                'search': 'provide the searches',
                '_input_charset': 'utf-8',
                'topSearch': 1,
                'atype': 'b',
                'searchfrom': 1,
                'action': 'home:redirect_app_action',
                'from': 1,
                'q': keyword,
                'sst': 1,
                'n': 20,
                'buying': 'buyitnow',
                'm': 'api4h5',
                'abtest': 16,
                'wlsort': 16,
                'style': 'list',
                'closeModues': 'nav,selecthot,onesearch',
                'sort': sortss,
                'page': page
            }

        postdata = urllib.parse.urlencode(postdata)
        taobao = "http://s.m.taobao.com/search?" + postdata
        print(taobao)

        try:
            content1 = getHtml(taobao)
            file = open(root + '/' + str(page) + '.json', 'wb')
            file.write(content1)
            file.close()
        except Exception as e:
            if hasattr(e, 'code'):
                print('Page does not exist or the request timed out.')
                print('Error code:', e.code)
            elif hasattr(e, 'reason'):
                print("Can't connect to the server.")
                print('Reason:', e.reason)
            else:
                print(e)

    files = listfiles(root, '.json')
    total = []
    # Column headers (in Chinese): page, store name, item title, discounted price, shipping
    # location, comment count, original price, units sold, promotion type, number of payers,
    # coin discount, item URL, image URL, image
    total.append(['页数', '店名', '商品标题', '商品打折价', '发货地址', '评论数', '原价', '售出件数', '政策享受', '付款人数', '金币折扣', 'URL地址', '图像URL', '图像'])

    for filename in files:
        try:
            doc = open(filename, 'rb')
            doccontent = doc.read().decode('utf-8', 'ignore')
            product = doccontent.replace(' ', '').replace('\n', '')
            product = json.loads(product)
            onefile = product['listItem']
        except:
            print("Can't read file " + filename)
            continue

        for item in onefile:
            itemlist = [filename, item['nick'], item['title'], item['price'], item['location'], item['commentCount']]
            itemlist.append(item['originalPrice'])
            # itemlist.append(item['mobileDiscount'])
            itemlist.append(item['sold'])
            itemlist.append(item['zkType'])
            itemlist.append(item['act'])
            itemlist.append(item['coinLimit'])
            itemlist.append('http:' + item['url'])
            # the two image columns are not filled because product images are skipped (see pre-conditions)
            total.append(itemlist)

    if len(total) > 1:
        writeexcel(roota + '/' + namess + keyword + 'taobao.xlsx', total)
    else:
        print('Nothing got from the server.')

    b = time.clock()
    print('Run time: ' + timetochina(b - a))


if __name__ == '__main__':
    begin()
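The three helpers used above are not included in the original post. A hedged sketch of plausible implementations that match how they are called (not the author's originals):

import glob
import os

def createjia(path):
    # Create the directory if it does not exist yet.
    if not os.path.exists(path):
        os.makedirs(path)

def listfiles(path, suffix):
    # Return all files under `path` whose names end with `suffix`.
    return glob.glob(os.path.join(path, '*' + suffix))

def timetochina(seconds):
    # Format a duration in seconds as minutes and seconds.
    m, s = divmod(int(seconds), 60)
    return '%d min %d s' % (m, s)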

 



Author: 哪儿黑
Link: https://www.jianshu.com/p/ffb65035fabc
Source: Jianshu (简书)
The copyright belongs to the author. For commercial reuse, contact the author for authorization; for non-commercial reuse, credit the source.


import os
import requests

image_url = 'https://image-comic.pstatic.net/webtoon/626907/237/20190115192015_d9f2b6e9d878a372dfa6f07ebfc5f14a_IMAG01_1.jpg'

# Set the Referer header so the request looks like it comes from the
# Wednesday webtoon "복학왕" (Bokhakwang) list page
headers = {'Referer': 'https://comic.naver.com/webtoon/list.nhn?titleId=626907&weekday=wed'}

response = requests.get(image_url, headers=headers)
print(response)                          # check the connection (200 = success)

image_data = response.content            # binary content of the image
imgfile = os.path.basename(image_url)    # use the last part of the url as the file name

# Write the image data to the file
with open(imgfile, 'wb') as f:
    f.write(image_data)
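The same Referer trick extends to several image files of an episode; a hedged sketch (the _IMAG01_N numbering pattern and the range are assumptions for illustration only):

base = 'https://image-comic.pstatic.net/webtoon/626907/237/'
for n in range(1, 4):   # hypothetical: first three image cuts of the episode
    url = base + '20190115192015_d9f2b6e9d878a372dfa6f07ebfc5f14a_IMAG01_%d.jpg' % n
    r = requests.get(url, headers=headers)
    if r.status_code == 200:
        with open('cut_%d.jpg' % n, 'wb') as f:
            f.write(r.content)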


chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko")
driver = webdriver.Chrome(chrome_options=chrome_options)

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'}
url = 'https://search.rakuten.co.jp/search/mall/B1651/?f=1&grp=produc'
requests.get(url, headers=headers)

# Note: Selenium's driver.get() does not accept a headers argument; the user agent
# has to be set through ChromeOptions (as above), so the extra keyword is dropped here.
driver.get('https://search.rakuten.co.jp/search/mall/B1651/?f=1&grp=product')
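If both the Selenium browser and plain requests calls need to look like the same visitor to the site, one common pattern is to copy the browser's cookies into a requests session. A sketch, assuming the driver above has already loaded a page on the target domain:

import requests
from selenium import webdriver

def session_from_driver(driver, user_agent):
    # Copy cookies from the Selenium browser into a requests session
    session = requests.Session()
    session.headers.update({'User-Agent': user_agent})
    for c in driver.get_cookies():
        session.cookies.set(c['name'], c['value'], domain=c.get('domain'))
    return session

# e.g. after driver.get(...), use session_from_driver(driver, headers['User-Agent']).get(url)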

LYDIA - 눈 코 입 ("Eyes, Nose, Lips")

https://youtu.be/HRs_B_O2ZbU

My first Selenium run with Python, here we go!


driver = webdriver.Chrome('chromedriver')
# Implicitly wait up to 3 seconds for web resources to load.
driver.implicitly_wait(3)
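An implicit wait applies to every element lookup; when one specific element is what you are waiting for, an explicit wait is the usual alternative. A sketch (the CSS selector '#content' is just a placeholder):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(driver, 3)
# Wait until the (placeholder) element is present, instead of waiting on every lookup
element = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#content')))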

Carrying Python on a USB stick

https://stackoverrun.com/ko/q/10806786

 


Running python3 from a usb drive (portably)



I want to run python3 on our school computers (under Windows) during our programming classes. I installed python 3.1 onto a USB flash drive at home (using Windows), and brought it to school. However, it gives me the following error:

The program can't start because python31.dll is missing from your computer. Try reinstalling the program to fix this problem.

How do I get the file, where do I put it (can I put it onto the USB itself?) and/or is there a better alternative for python3 portability? The reason why I don't simply use an online editor is because I also want to have pygame along with python on the USB.
