点击流网络抓取和分析

来自集智百科
跳转到：导航、搜索

Clickstreamnetwork 2013 11 29.png

    # -*- coding: utf-8 -*-
 
    import urllib2
    import re
    import time
    import random
    import numpy as np
    import networkx as nx
    from bs4 import BeautifulSoup
 
 
    #==================Compete experiment=========================
 
    #------------define scraping function----------------
 
    def getUpStream(name, apikey="407a987a97bc689a8962f23791b5c4e8"):
        """Fetch the incoming-referral (upstream) sites for `name` from the
        Compete API.

        Parameters
        ----------
        name : str
            Domain to query, e.g. "pku.edu.cn".
        apikey : str, optional
            Compete API key. Defaults to the key originally hard-coded here;
            an alternative key noted in the source is
            5cbfb94915b8da4bd826f678d7c0989e.

        Returns
        -------
        list of (source_site, target_site, traffic) tuples, where
        target_site is always `name`.
        """
        url = ("https://apps.compete.com/sites/" + name +
               "/iref/iref_all/?apikey=" + apikey)
        html = urllib2.urlopen(url).read()
        # The response is JSON, but it is parsed by brute-force string
        # splitting on the "incoming_referral" records rather than a JSON
        # parser; raw strings keep the regex escapes explicit.
        chunks = re.split(r'"incoming_referral"', html)
        sites = []
        traffic = []
        for chunk in chunks[3:]:  # leading chunks are header/metadata, skipped
            fields = re.split(r',', chunk)
            sites.append(re.split(r'"', fields[0])[1])
            traffic.append(re.split(r'"', fields[3])[2][1:])
        targets = [name] * len(sites)
        data = zip(sites, targets, traffic)
        return data
 
 
    #--------------test running time-------------------
 
    def test(name):
        """Time one getUpStream call.

        Returns [duration, data]: the wall-clock seconds the call took and
        the scraped upstream data for `name`.

        Fix: the original used time.clock(), which on Unix measures CPU
        time and therefore excludes time spent blocked on network I/O --
        exactly what dominates here. time.time() gives the intended
        wall-clock duration.
        """
        start = time.time()
        data = getUpStream(name)
        duration = time.time() - start
        return [duration, data]
 
    # Time a single Compete API call, then seed a multi-level crawl.
    d, s = test("pku.edu.cn")
    print(d)  # seconds for one API round-trip

    #-------------three level clickstream network---------

    # Level 1: upstream referrers of google.com.
    l1 = getUpStream("google.com")
    # Unique source domains become the seeds for the next level.
    s1 = np.unique(np.array(l1)[:, 0])
    # Level 2: upstream referrers of every level-1 source.
    l2 = [getUpStream(site) for site in s1]
 
 
 
    #==================Alexa experiment=========================
 
    def getUpStreams(aName):
        """Scrape Alexa's siteinfo page for `aName` and return its top-10
        upstream (incoming-click) sites.

        Returns an Nx3 numpy array of rows
        (upstream_site, aName, estimated_traffic), or None on any failure.
        Failures are swallowed on purpose so a bulk crawl over many sites
        keeps going.
        """
        try:
            url = "http://www.alexa.com/siteinfo/" + aName
            html = urllib2.urlopen(url).read()
            time.sleep(random.random())  # polite random pause between requests
            soup = BeautifulSoup(html)
            # Alexa global rank, used as a rough traffic-size proxy
            # (size = 1000 / rank).
            # NOTE(review): int(r) raises if the rank is rendered with
            # commas, which lands in the except branch -- confirm the
            # page format.
            r = soup.select("body strong a")[1].string
            size = 1000 / int(r)
            a = soup.select("#upstream-content")[0]  # upstream table, by id
            c = a.select("a")
            sites = []
            for i in c:
                sites.append(i.string)
            b = a.select('td[class=text-right]')
            traffic = []
            for i in b:
                # Percentage of aName's traffic from this upstream site,
                # scaled by the size proxy.
                traffic.append(float(i.select("span")[0].string.strip("%")) * size / 100)
            me = [aName] * len(sites)
            data = np.array(zip(sites[:10], me, traffic[:10]))
            return data
        except Exception:
            # Fix: the original bare `except:` also swallowed
            # KeyboardInterrupt/SystemExit; narrow to Exception and make
            # the best-effort None return explicit.
            return None
 
    def getTraffic(aName):
        """Return the 1000/rank traffic-size proxy for `aName` scraped
        from Alexa, or None on any failure (best-effort, so bulk map()
        calls keep going)."""
        try:
            url = "http://www.alexa.com/siteinfo/" + aName
            html = urllib2.urlopen(url).read()
            time.sleep(random.random())  # polite random pause between requests
            soup = BeautifulSoup(html)
            # Alexa global rank -> size proxy, same formula as getUpStreams.
            r = soup.select("body strong a")[1].string
            size = 1000 / int(r)
            return size
        except Exception:
            # Fix: the original bare `except:` also swallowed
            # KeyboardInterrupt/SystemExit; narrow to Exception and make
            # the None return explicit.
            return None
 
    # Crawl three levels of upstream sites, starting from google.com.
    data1 = getUpStreams("google.com")
    seed1 = np.unique(data1[:, 0])          # level-1 source domains
    data2 = [getUpStreams(site) for site in seed1]


    data2 = np.vstack(data2)                # stack level-2 arrays into one
    seed2 = np.unique(data2[:, 0])          # level-2 source domains
    data3 = [getUpStreams(site) for site in seed2]
    data3 = np.vstack(data3)
    seed3 = np.unique(data3[:, 0])
    # All edges from every level, one (source, target, traffic) row each.
    data = np.concatenate((data1, data2, data3), axis=0)
 
    #================analyze network==============================
 
    # Build a directed clickstream network: edge source -> target weighted
    # by the scraped traffic value; edges with traffic <= 1 are dropped.
    G=nx.DiGraph()
    for i in data:
        if float(i[2])>1:
            G.add_edge(i[0],i[1],weight=float(i[2]))

    # One more Alexa request per node to size it in the plot.
    # NOTE(review): getTraffic returns None on failure, and a None in
    # `sizes` breaks the np.array arithmetic below -- verify before running.
    sizes=map(getTraffic,G.nodes())

    # NOTE(review): `plt` is never imported in this file; this section needs
    # `import matplotlib.pyplot as plt` at the top to run.
    fig = plt.figure()
    fig.patch.set_facecolor('black')
    #node_color=[float(G.degree(v)) for v in G]
    # NOTE(review): nx.graphviz_layout requires pygraphviz/pydot to be
    # installed -- confirm the environment.
    pos=nx.graphviz_layout(G)
    # Node size proportional to traffic; darker color for larger sites.
    nx.draw(G, pos, node_size = np.array(sizes)*3, node_color = 1-(np.log(np.array(sizes)))**(-0.3),
            alpha=0.5,
            edge_color="gray", 
            width =0.5, with_labels= True,font_size=10)  

    #plt.savefig("E:/exampleNetwork.png")
个人工具
名字空间
操作
导航
工具箱