一半君的总结纸

听话只听一半君

如何自动检查cmi有没有出新的教程 – 今天你吃药了么?

因为lz不经常看教程,但是又想在教程出的第一时间入手…. orz lz有强迫症

下面是个用 selenium 来检查有没有出新教程的脑残作……当然也可以试试改用 HtmlUnit Driver,以避免弹出浏览器窗口(lz没有试过,cmi网站有那么多js,不知道这个driver行不行,但是lz猜 PhantomJS 必然可以);还可以加上一有新教程就自动发email的功能,然后放进cron里,每天检查一次……

想试用的话,需要这些packages

pip install selenium lxml tabulate schedule
#!/usr/bin/env python2

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import selenium.common.exceptions as seleniumExceptions
# available since 2.4.0
from selenium.webdriver.support.ui import WebDriverWait
# available since 2.26.0
from selenium.webdriver.support import expected_conditions as EC

import sys
import time

from lxml import html
from tabulate import tabulate

try:
    from collections import OrderedDict
except:
    from ordereddict import OrderedDict

# Site root and the store page that lists the releases.
url = 'https://cmivfx.com'
storeUrl = url + '/store'

# True when the script runs on Windows (sys.platform begins with "win").
OnWindows = sys.platform[:3] == 'win'

def getFirefoxSession():
    """Start Firefox, open the store page and wait for the new-releases module.

    Returns:
        The selenium Firefox driver with ``storeUrl`` loaded.

    Raises:
        Whatever selenium raises while loading or waiting (the explicit wait
        gives up after 10 seconds).  The browser is shut down before the
        exception propagates, so no orphaned Firefox is left behind.
    """
    browser = webdriver.Firefox()

    try:
        browser.get(storeUrl)
        if OnWindows:
            browser.maximize_window()

        # Fixed pause ahead of the explicit wait below.
        # NOTE(review): presumably lets the page's JavaScript render -- confirm it is still needed.
        time.sleep(5)

        # By.CLASS_NAME accepts a single class name only, so the multi-class
        # section is located with a CSS selector instead.
        WebDriverWait(browser, 10).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, ".module.module-collapse.module-releases-new")))
    except Exception:
        browser.quit()
        raise

    return browser

def checkNewReleases(browser):
    """Scrape the "new releases" module of the already loaded store page.

    Args:
        browser: selenium driver positioned on the store page.

    Returns:
        A list of ``(product id, category id, title)`` tuples, one per
        ``a.product-box`` link that carries a ``data-productid`` attribute.
        The ids are the raw attribute strings; the title is ``None`` when the
        link has no ``div/h2`` heading.  The list is empty when the module is
        missing from the page or has no content.
    """
    sections = browser.find_elements(By.CSS_SELECTOR, '.module-releases-new')
    if not sections:
        return []

    markup = sections[0].get_attribute('innerHTML')
    if not markup or not markup.strip():
        # lxml refuses to parse an empty document.
        return []

    tree = html.fromstring(markup)

    releases = []
    for box in tree.findall('.//a[@class="product-box"]'):
        vid = box.get('data-productid')
        if vid is None:
            # The id is converted with int() later on; an entry without one is unusable.
            continue

        heading = box.find('div/h2')
        title = heading.text if heading is not None else None

        releases.append((vid, box.get('data-categoryid'), title))

    return releases

def printTable(newReleases):
    """Print the releases as a grid table with id / category / title columns."""
    columnHeaders = ['id', 'category', 'title']
    rendered = tabulate(newReleases, headers=columnHeaders, tablefmt="grid")
    print(rendered)

def getLatestVid(newReleases):
    """Return the largest product id, as an int, among (vid, cid, title) tuples."""
    latest = max(int(vid) for vid, _cid, _title in newReleases)
    return latest

if __name__ == '__main__':

    # Highest product id already known; the first command-line argument overrides it.
    # this is an example, in fact it's 594 now
    lastVid = 592

    if len(sys.argv) == 2:
        lastVid = int(sys.argv[1])

    browser = getFirefoxSession()
    try:
        newReleases = checkNewReleases(browser)
    finally:
        # Close Firefox once the page has been scraped, also when scraping fails.
        browser.quit()

    latestVid = getLatestVid(newReleases)

    # filter new releases
    needToGetReleases = [n for n in newReleases if int(n[0]) > lastVid]

    printTable(newReleases)

    print('Latest vid : %d' % latestVid)
    if latestVid > lastVid:
        print('You need to fuck it ... orz')
        if needToGetReleases:
            printTable(needToGetReleases)
    else:
        print('No need to fuck')

运行效果图如下所示:

cmiCheck

lz脑抽,加上了发email的部分

#!/usr/bin/env python2

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import selenium.common.exceptions as seleniumExceptions
# available since 2.4.0
from selenium.webdriver.support.ui import WebDriverWait
# available since 2.26.0
from selenium.webdriver.support import expected_conditions as EC

import sys
import time

from lxml import html
from tabulate import tabulate

import smtplib
from email.mime.text import MIMEText

import textwrap

try:
    from collections import OrderedDict
except:
    from ordereddict import OrderedDict

# Site root and the store page that lists the releases.
url = 'https://cmivfx.com'
storeUrl = '/'.join([url, 'store'])

def getFirefoxSession():
    """Start PhantomJS, open the store page and wait for the new-releases module.

    The name is kept from the Firefox-based version of the script.

    Returns:
        The selenium PhantomJS driver with ``storeUrl`` loaded.

    Raises:
        Whatever selenium raises while loading or waiting (the explicit wait
        gives up after 10 seconds).  The browser is shut down before the
        exception propagates, so no orphaned process is left behind.
    """
    browser = webdriver.PhantomJS()

    try:
        browser.get(storeUrl)

        # Fixed pause ahead of the explicit wait below.
        # NOTE(review): presumably lets the page's JavaScript render -- confirm it is still needed.
        time.sleep(5)

        WebDriverWait(browser, 10).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, ".module.module-collapse.module-releases-new")))
    except Exception:
        browser.quit()
        raise

    return browser

def checkNewReleases(browser):
    """Scrape the "new releases" module of the already loaded store page.

    Args:
        browser: selenium driver positioned on the store page.

    Returns:
        A list of ``(product id, category id, title)`` tuples, one per
        ``a.product-box`` link that carries a ``data-productid`` attribute.
        The ids are the raw attribute strings; the title is ``None`` when the
        link has no ``div/h2`` heading.  The list is empty when the module is
        missing from the page or has no content.
    """
    modules = browser.find_elements(By.CSS_SELECTOR, '.module-releases-new')
    if not modules:
        return []

    innerHtml = modules[0].get_attribute('innerHTML')
    if not innerHtml or not innerHtml.strip():
        # lxml refuses to parse an empty document.
        return []

    fragment = html.fromstring(innerHtml)

    found = []
    for link in fragment.findall('.//a[@class="product-box"]'):
        vid = link.get('data-productid')
        if vid is None:
            # The id is converted with int() later on; an entry without one is unusable.
            continue

        cid = link.get('data-categoryid')
        h2 = link.find('div/h2')
        title = None if h2 is None else h2.text

        found.append((vid, cid, title))

    return found

def printTable(newReleases,skipPrint=False,tablefmt='grid'):
    """Render the releases as a table and return the rendered text.

    Args:
        newReleases: rows of (id, category, title).
        skipPrint: when true the table is only returned, not printed.
        tablefmt: tabulate table format name.
    """
    table = tabulate(newReleases, headers=['id', 'category', 'title'], tablefmt=tablefmt)
    if skipPrint:
        return table

    print(table)
    return table

def getLatestVid(newReleases):
    """Return the highest product id (converted to int) of the given releases."""
    numericVids = []
    for vid, _cid, _title in newReleases:
        numericVids.append(int(vid))
    return max(numericVids)

def sendMail(subject='New cmi tutorial to fuck !',content=None):
    """Send a plain-text notification through the SMTP server on localhost.

    Args:
        subject: value of the Subject header.
        content: message body; ``None`` is sent as an empty body.  Text is
            encoded as UTF-8, a byte string is assumed to be UTF-8 already.
    """
    if content is None:
        content = u''
    if not isinstance(content, bytes):
        content = content.encode('utf-8')

    # Declare the charset explicitly; without it the UTF-8 bytes would be
    # labelled us-ascii and non-ASCII titles would show up garbled.
    msg = MIMEText(content, 'plain', 'utf-8')
    msg['Subject'] = subject
    msg['From'] = 'nobody@fucker.com'
    msg['To'] = 'me@fucker.com'

    s = smtplib.SMTP('localhost')
    try:
        # put your email here
        s.sendmail('work@fucker.com', ['your-email-here'], msg.as_string())
    finally:
        # Close the connection even when delivery fails.
        s.quit()

if __name__ == '__main__':

    # Highest product id already known; the first command-line argument overrides it.
    lastVid = 592

    if len(sys.argv) == 2:
        lastVid = int(sys.argv[1])

    browser = getFirefoxSession()
    try:
        newReleases = checkNewReleases(browser)
    finally:
        # Shut the browser process down once the page has been scraped,
        # also when scraping fails.
        browser.quit()

    latestVid = getLatestVid(newReleases)

    # filter new releases
    needToGetReleases = [n for n in newReleases if int(n[0]) > lastVid]

    printTable(newReleases)

    print('Latest vid : %d' % latestVid)
    if latestVid > lastVid:
        print('You need to fuck it ... orz')
        if needToGetReleases:
            printTable(needToGetReleases)
            # The template is indented for readability; dedent() removes the
            # common margin before the two tables are substituted.
            emailBody = '''
                        need to fuck releases:

                        %s

                        all new releases:

                        %s

                        '''
            emailBody = textwrap.dedent(emailBody)
            emailBody = emailBody % (
                printTable(needToGetReleases, skipPrint=True, tablefmt='simple'),
                printTable(newReleases, skipPrint=True, tablefmt='simple'))

            sendMail(content=emailBody)
    else:
        print('No need to fuck')

收到的email效果图 如下所示:

cmiCheckEmail

又加了个logging功能

#!/usr/bin/env python2

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import selenium.common.exceptions as seleniumExceptions
# available since 2.4.0
from selenium.webdriver.support.ui import WebDriverWait
# available since 2.26.0
from selenium.webdriver.support import expected_conditions as EC

import sys
import time

from lxml import html
from tabulate import tabulate

import smtplib
from email.mime.text import MIMEText

import textwrap
import logging

try:
    from collections import OrderedDict
except:
    from ordereddict import OrderedDict

# Site root and the store page that lists the releases.
url = 'https://cmivfx.com'
storeUrl = url + '/store'

# File logging: the "cmiCheck" logger writes timestamped records of level
# DEBUG and above to cmiCheck.log.
level = logging.DEBUG
formatter = logging.Formatter('%(asctime)s - %(levelname)s: %(message)s')

file_handler = logging.FileHandler('cmiCheck.log')
file_handler.setFormatter(formatter)

logger = logging.getLogger('cmiCheck')
logger.setLevel(level)
logger.addHandler(file_handler)

def getFirefoxSession():
    """Start PhantomJS, open the store page and wait for the new-releases module.

    The name is kept from the Firefox-based version of the script.

    Returns:
        The selenium PhantomJS driver with ``storeUrl`` loaded.

    Raises:
        Whatever selenium raises while loading or waiting (the explicit wait
        gives up after 10 seconds).  The browser is shut down before the
        exception propagates, so no orphaned process is left behind.
    """
    browser = webdriver.PhantomJS()

    try:
        browser.get(storeUrl)

        # Fixed pause ahead of the explicit wait below.
        # NOTE(review): presumably lets the page's JavaScript render -- confirm it is still needed.
        time.sleep(5)

        WebDriverWait(browser, 10).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, ".module.module-collapse.module-releases-new")))
    except Exception:
        browser.quit()
        raise

    return browser

def checkNewReleases(browser):
    """Scrape the "new releases" module of the already loaded store page.

    Args:
        browser: selenium driver positioned on the store page.

    Returns:
        A list of ``(product id, category id, title)`` tuples, one per
        ``a.product-box`` link that carries a ``data-productid`` attribute.
        The ids are the raw attribute strings; the title is ``None`` when the
        link has no ``div/h2`` heading.  The list is empty when the module is
        missing from the page or has no content.
    """
    sections = browser.find_elements(By.CSS_SELECTOR, '.module-releases-new')
    if not sections:
        return []

    markup = sections[0].get_attribute('innerHTML')
    if not markup or not markup.strip():
        # lxml refuses to parse an empty document.
        return []

    tree = html.fromstring(markup)

    releases = []
    for box in tree.findall('.//a[@class="product-box"]'):
        vid = box.get('data-productid')
        if vid is None:
            # The id is converted with int() later on; an entry without one is unusable.
            continue

        heading = box.find('div/h2')
        title = heading.text if heading is not None else None

        releases.append((vid, box.get('data-categoryid'), title))

    return releases

def printTable(newReleases, skipPrint=False, tablefmt='grid'):
    """Build a table of the releases, optionally print it, and return its text.

    Args:
        newReleases: rows of (id, category, title).
        skipPrint: set to True to suppress printing.
        tablefmt: tabulate table format name.
    """
    shouldPrint = not skipPrint
    columns = ['id', 'category', 'title']

    text = tabulate(newReleases, headers=columns, tablefmt=tablefmt)
    if shouldPrint:
        print(text)

    return text

def getLatestVid(newReleases):
    """Return, as an int, the numerically largest product id of the releases."""
    # Compare the ids by numeric value, then convert the winner.
    newest = max((vid for vid, _cid, _title in newReleases), key=int)
    return int(newest)

def sendMail(subject='New cmi tutorial to fuck !', content=None):
    """Send a plain-text notification through the SMTP server on localhost.

    Args:
        subject: value of the Subject header.
        content: message body; ``None`` is sent as an empty body.  Text is
            encoded as UTF-8, a byte string is assumed to be UTF-8 already.
    """
    if content is None:
        content = u''
    if not isinstance(content, bytes):
        content = content.encode('utf-8')

    # Declare the charset explicitly; without it the UTF-8 bytes would be
    # labelled us-ascii and non-ASCII titles would show up garbled.
    msg = MIMEText(content, 'plain', 'utf-8')
    msg['Subject'] = subject
    msg['From'] = 'nobody@fucker.com'
    msg['To'] = 'me@fucker.com'

    s = smtplib.SMTP('localhost')
    try:
        # put your email here
        s.sendmail('work@fucker.com', ['your-email-here'], msg.as_string())
    finally:
        # Close the connection even when delivery fails.
        s.quit()

if __name__ == '__main__':

    # Highest product id already known; the first command-line argument overrides it.
    lastVid = 592

    if len(sys.argv) == 2:
        lastVid = int(sys.argv[1])

    logger.info('Launched Check with current latest vid : %d ', lastVid)

    browser = getFirefoxSession()
    try:
        newReleases = checkNewReleases(browser)
    finally:
        # Shut the browser process down once the page has been scraped,
        # also when scraping fails.
        browser.quit()

    latestVid = getLatestVid(newReleases)

    # filter new releases
    needToGetReleases = [n for n in newReleases if int(n[0]) > lastVid]

    printTable(newReleases)

    print('Latest vid : %d' % latestVid)
    if latestVid > lastVid:
        print('You need to fuck it ... orz')
        if needToGetReleases:
            logger.info('New vids detected : %s ', ' '.join([n[0] for n in needToGetReleases]))

            printTable(needToGetReleases)
            # The template is indented for readability; dedent() removes the
            # common margin before the two tables are substituted.
            emailBody = '''
                        need to fuck releases:

                        %s

                        all new releases:

                        %s

                        '''
            emailBody = textwrap.dedent(emailBody)
            emailBody = emailBody % (
                printTable(needToGetReleases, skipPrint=True, tablefmt='simple'),
                printTable(newReleases, skipPrint=True, tablefmt='simple'))

            sendMail(content=emailBody)
    else:
        print('No need to fuck')
        logger.info('No new vids to fuck')

log文件效果图
cmiCheckLog

试了一下,放在cron里运行不了,于是先用了个脑残的办法:

# run once a day (86400 seconds) ...
watch -n 86400 cmiCheck.py

貌似 watch 的间隔设不了这么大,最后改用了 schedule 这个 package:

import schedule
import time

def job():
    """Placeholder task: print a fixed progress message."""
    message = "I'm working..."
    print(message)

# Register the same job three times: every 10 minutes, every hour,
# and every day at 10:30.
schedule.every(10).minutes.do(job)
schedule.every().hour.do(job)
schedule.every().day.at("10:30").do(job)

# Poll once per second and run whichever jobs have come due.
while True:
    schedule.run_pending()
    time.sleep(1)

最终版

#!/usr/bin/env python2

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import selenium.common.exceptions as seleniumExceptions
# available since 2.4.0
from selenium.webdriver.support.ui import WebDriverWait
# available since 2.26.0
from selenium.webdriver.support import expected_conditions as EC

import sys
import time

from lxml import html
from tabulate import tabulate

import smtplib
from email.mime.text import MIMEText

import textwrap
import logging
import schedule
import argparse

try:
    from collections import OrderedDict
except:
    from ordereddict import OrderedDict

# Site root and the store page that lists the releases.
url = 'https://cmivfx.com'
storeUrl = '{0}/store'.format(url)

# Address the notification mail is delivered to; replace the placeholder.
NOTIFY_EMAIL = 'your-email-here'

# The "cmiCheck" logger records everything from DEBUG upwards in cmiCheck.log,
# one timestamped line per record.
logger = logging.getLogger('cmiCheck')
level = logging.DEBUG
logger.setLevel(level)

formatter = logging.Formatter('%(asctime)s - %(levelname)s: %(message)s')
file_handler = logging.FileHandler('cmiCheck.log')
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

def getFirefoxSession():
    """Start PhantomJS, open the store page and wait for the new-releases module.

    The name is kept from the Firefox-based version of the script.

    Returns:
        The selenium PhantomJS driver with ``storeUrl`` loaded.

    Raises:
        Whatever selenium raises while loading or waiting (the explicit wait
        gives up after 10 seconds).  The browser is shut down before the
        exception propagates, so no orphaned process is left behind.
    """
    browser = webdriver.PhantomJS()

    try:
        browser.get(storeUrl)

        # Fixed pause ahead of the explicit wait below.
        # NOTE(review): presumably lets the page's JavaScript render -- confirm it is still needed.
        time.sleep(5)

        WebDriverWait(browser, 10).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, ".module.module-collapse.module-releases-new")))
    except Exception:
        browser.quit()
        raise

    return browser


def checkNewReleases(browser):
    """Scrape the "new releases" module of the already loaded store page.

    Args:
        browser: selenium driver positioned on the store page.

    Returns:
        A list of ``(product id, category id, title)`` tuples, one per
        ``a.product-box`` link that carries a ``data-productid`` attribute.
        The ids are the raw attribute strings; the title is ``None`` when the
        link has no ``div/h2`` heading.  The list is empty when the module is
        missing from the page or has no content.
    """
    modules = browser.find_elements(By.CSS_SELECTOR, '.module-releases-new')
    if not modules:
        return []

    innerHtml = modules[0].get_attribute('innerHTML')
    if not innerHtml or not innerHtml.strip():
        # lxml refuses to parse an empty document.
        return []

    fragment = html.fromstring(innerHtml)

    found = []
    for link in fragment.findall('.//a[@class="product-box"]'):
        vid = link.get('data-productid')
        if vid is None:
            # The id is converted with int() later on; an entry without one is unusable.
            continue

        cid = link.get('data-categoryid')
        h2 = link.find('div/h2')
        title = None if h2 is None else h2.text

        found.append((vid, cid, title))

    return found


def printTable(newReleases, skipPrint=False, tablefmt='grid'):
    """Format the releases as an id / category / title table.

    Args:
        newReleases: rows of (id, category, title).
        skipPrint: True returns the text without printing it.
        tablefmt: tabulate table format name.

    Returns:
        The rendered table text.
    """
    rendered = tabulate(
        newReleases,
        headers=['id', 'category', 'title'],
        tablefmt=tablefmt)

    if not skipPrint:
        print(rendered)
    return rendered


def getLatestVid(newReleases):
    """Return the maximum of the releases' product ids, converted to int."""
    return max(int(vid) for (vid, _cid, _title) in newReleases)


def sendMail(subject='New cmi tutorial to fuck !', content=None):
    """Send a plain-text notification to NOTIFY_EMAIL via the SMTP server on localhost.

    Args:
        subject: value of the Subject header.
        content: message body; ``None`` is sent as an empty body.  Text is
            encoded as UTF-8, a byte string is assumed to be UTF-8 already.
    """
    if content is None:
        content = u''
    if not isinstance(content, bytes):
        content = content.encode('utf-8')

    # Declare the charset explicitly; without it the UTF-8 bytes would be
    # labelled us-ascii and non-ASCII titles would show up garbled.
    msg = MIMEText(content, 'plain', 'utf-8')
    msg['Subject'] = subject
    msg['From'] = 'nobody@fucker.com'
    # NOTE(review): the To header is a fixed placeholder while delivery goes
    # to NOTIFY_EMAIL -- confirm whether the header should follow it.
    msg['To'] = 'me@fucker.com'

    s = smtplib.SMTP('localhost')
    try:
        s.sendmail('work@fucker.com', [NOTIFY_EMAIL], msg.as_string())
    finally:
        # Close the connection even when delivery fails.
        s.quit()

def run(args):
    """Run one check: scrape the store, report new releases, mail them if any.

    Args:
        args: parsed command-line namespace.  ``lastVid`` is the highest
            product id already known; ``scheduled`` (when present and true)
            makes the function print the time of the next scheduled run.
    """
    print('Running cmiCheck ...')

    lastVid = int(args.lastVid)

    logger.info('Launched Check with current latest vid : %d ', lastVid)

    browser = getFirefoxSession()
    try:
        newReleases = checkNewReleases(browser)
    finally:
        # A scheduled process calls run() again and again; quit the browser
        # every time so its processes do not pile up.
        browser.quit()

    latestVid = getLatestVid(newReleases)

    # filter new releases
    needToGetReleases = [n for n in newReleases if int(n[0]) > lastVid]

    printTable(newReleases)

    print('Latest vid : %d' % latestVid)
    if latestVid > lastVid:
        print('You need to fuck it ... orz')
        if needToGetReleases:
            logger.info('New vids detected : %s ', ' '.join([n[0] for n in needToGetReleases]))

            printTable(needToGetReleases)
            # The template is indented for readability; dedent() removes the
            # common margin before the two tables are substituted.
            emailBody = '''
                        need to fuck releases:

                        %s

                        all new releases:

                        %s

                        '''
            emailBody = textwrap.dedent(emailBody)
            emailBody = emailBody % (
                printTable(needToGetReleases, skipPrint=True, tablefmt='simple'),
                printTable(newReleases, skipPrint=True, tablefmt='simple'))

            # NOTE(review): args.lastVid is never advanced, so a scheduled
            # process reports the same releases on every run -- confirm intended.
            sendMail(content=emailBody)
    else:
        print('No need to fuck')
        logger.info('No new vids to fuck')

    if getattr(args, 'scheduled', False):
        # Only a scheduled process has a job registered; in one-shot mode
        # there is no next run to format and the call below would fail.
        # NOTE(review): while the job itself is executing this may still show
        # the run that just fired -- verify against the schedule library.
        print('Next run: %s' % schedule.next_run().strftime('%m/%d/%Y %H:%M:%S'))

def _parseArgs():

    parser = argparse.ArgumentParser()
    parser.add_argument('-v','--lastVid',required=False, default='595')
    parser.add_argument('-s','--scheduled',required=False, default=False, action='store_true')
    
    args = parser.parse_args()
    return args

if __name__ == '__main__':

    args = _parseArgs()

    if args.scheduled:
        def _scheduledRun():
            # One failed check (for example the page timing out) must not
            # end the daily schedule: log the error and keep polling.
            try:
                run(args)
            except Exception:
                logger.exception('Scheduled check failed')
                print('Scheduled check failed, see cmiCheck.log')

        # Check once a day at 12:44.
        schObj = schedule.every(1).day.at("12:44")
        schObj.do(_scheduledRun)

        print('Next run: %s' % schedule.next_run().strftime('%m/%d/%Y %H:%M:%S'))
        # Wake up every two minutes and run the job once it is due.
        while True:
            time.sleep(120)
            schedule.run_pending()

    else:
        # One-shot mode: a single check, errors propagate to the caller.
        run(args)

ps:
已测试PhantomJS的确可以,只要把开浏览器的那行换成

# Replaces the webdriver.Firefox() line so that no browser window is opened.
browser = webdriver.PhantomJS()

即可

Advertisements

发表评论

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / 更改 )

Twitter picture

You are commenting using your Twitter account. Log Out / 更改 )

Facebook photo

You are commenting using your Facebook account. Log Out / 更改 )

Google+ photo

You are commenting using your Google+ account. Log Out / 更改 )

Connecting to %s

%d 博主赞过: