
Crawler Environment Setup

2019-03-11
Henryzhou

spider Note

Installing the Python environment
  • python3.7
Installing request libraries
  • Installing the HTTP request library requests: conda install requests -n spider
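
    A quick check that requests works (a minimal sketch; httpbin.org is just a public echo service):

    import requests

    resp = requests.get("https://httpbin.org/get", timeout=10)
    print(resp.status_code)        # 200 on success
    print(resp.json()["url"])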

  • Installing the automated testing tool Selenium: conda install selenium -n spider

  • Adding chromedriver, which lets Selenium drive the Chrome browser, to PATH:

    # append to the end of ~/.bashrc, then reload it
    export PATH=/home/henry/opt:$PATH
    source ~/.bashrc
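
    A minimal sketch to confirm Selenium can drive Chrome through chromedriver (assumes Chrome itself is installed):

    from selenium import webdriver

    driver = webdriver.Chrome()            # locates chromedriver via PATH
    driver.get("https://www.python.org")
    print(driver.title)                    # page title if everything is wired up
    driver.quit()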
    
  • Using PhantomJS for headless (background) browser control

    Download PhantomJS and add its directory to PATH in the same way as chromedriver
    
  • Installing the asynchronous web request library aiohttp

    conda install aiohttp -n spider
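
    A minimal async fetch with aiohttp (sketch; single URL only):

    import asyncio
    import aiohttp

    async def fetch(url):
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as resp:
                return resp.status

    print(asyncio.run(fetch("https://httpbin.org/get")))   # 200 on success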
    
Installing parsing libraries
  • Installing lxml, which parses HTML and XML and supports XPath queries

    conda install lxml -n spider
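
    A short XPath sketch with lxml (the HTML string is made up for illustration):

    from lxml import etree

    html = etree.HTML('<div><a href="/a">first</a><a href="/b">second</a></div>')
    print(html.xpath('//a/@href'))    # ['/a', '/b']
    print(html.xpath('//a/text()'))   # ['first', 'second']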
    
  • Installing beautifulsoup4, which depends on lxml

    conda install beautifulsoup4 -n spider
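
    The same kind of parse done with beautifulsoup4 on top of the lxml parser (sketch):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<div><a href="/a">first</a></div>', 'lxml')
    print(soup.a['href'])    # /a
    print(soup.a.string)     # first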
    
  • Installing pyquery

    conda install pyquery -n spider
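
    And the jQuery-style equivalent with pyquery (sketch):

    from pyquery import PyQuery as pq

    doc = pq('<div><a href="/a">first</a></div>')
    print(doc('a').attr('href'))   # /a
    print(doc('a').text())         # first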
    
  • Installing the CAPTCHA recognition tool tesseract

    sudo apt install -y tesseract-ocr libtesseract-dev libleptonica-dev
    conda activate spider
    pip install tesserocr pillow
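
    A minimal OCR sketch with tesserocr and Pillow (captcha.png is a hypothetical image file):

    import tesserocr
    from PIL import Image

    image = Image.open('captcha.png')       # hypothetical captcha image
    print(tesserocr.image_to_text(image))   # recognized text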
    
Installing databases
  • Installing MySQL

    sudo apt update
    sudo apt install -y mysql-server mysql-client
    # commands to start, stop, and restart mysql
    #sudo service mysql start
    #sudo service mysql stop
    #sudo service mysql restart
    
    MySQL 5.7+ no longer prompts you to set a root password during installation, so it has to be configured manually
    $ mysql -u debian-sys-maint -p
    # the debian-sys-maint password can be read from /etc/mysql/debian.cnf (V3XKquzHqnW18GWc); then run the following statements in the mysql shell
      
    show databases;
    use mysql;
    update user set authentication_string=PASSWORD("zhou") where user='root';
    update user set plugin="mysql_native_password";
    flush privileges;
    quit;
    
    # how to completely remove mysql
    sudo apt purge mysql-*
    sudo rm -rf /etc/mysql/ /var/lib/mysql
    sudo apt autoremove
    
  • Installing Redis

    sudo apt install redis-server
    
Installing storage libraries
  • Installing pymysql so that Python can interact with MySQL

    conda install pymysql -n spider
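
    A minimal connection test for pymysql (assumes the root password "zhou" set above):

    import pymysql

    conn = pymysql.connect(host='localhost', user='root', password='zhou')
    cursor = conn.cursor()
    cursor.execute('SELECT VERSION()')
    print(cursor.fetchone())   # the server version tuple
    conn.close()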
    
  • Installing redis-py

    conda install redis-py -n spider   # the conda package "redis" is the server; the Python client is "redis-py"
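
    A minimal redis-py round trip (assumes redis-server is running on the default port 6379):

    import redis

    r = redis.StrictRedis(host='localhost', port=6379)
    r.set('name', 'spider')
    print(r.get('name'))   # b'spider'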
    
Installing web libraries
  • Installing the Flask web library to build API endpoints for the crawlers; later, Flask+Redis are used to maintain a dynamic proxy pool and a cookies pool

    conda install flask -n spider
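
    A hello-world Flask app of the kind those API endpoints are built on (minimal sketch):

    from flask import Flask

    app = Flask(__name__)

    @app.route('/')
    def index():
        return 'Hello, spider!'

    if __name__ == '__main__':
        app.run()   # serves on http://127.0.0.1:5000/ by default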
    
  • Installing tornado; later, tornado+Redis are used to build an ADSL dial-up proxy pool

    conda install tornado -n spider
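
    The tornado equivalent, for comparison (minimal sketch):

    import tornado.ioloop
    import tornado.web

    class MainHandler(tornado.web.RequestHandler):
        def get(self):
            self.write('Hello, spider!')

    if __name__ == '__main__':
        app = tornado.web.Application([(r'/', MainHandler)])
        app.listen(8888)
        tornado.ioloop.IOLoop.current().start()
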
Installing crawler frameworks
  • Installing pyspider

    pip install pyspider
    # test pyspider
    pyspider all
      
    # fails on startup: pyspider 0.3.x predates Python 3.7, where async became a reserved keyword
    
  • Installing Scrapy

    conda install scrapy
    
  • Installing scrapy-splash

    # install docker
    curl -sSL https://get.docker.com/ | sh
    # run the Splash rendering service in docker (listens on port 8050)
    docker run -p 8050:8050 scrapinghub/splash
    # install the scrapy-splash python library
    pip install scrapy-splash
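
    With Splash listening on port 8050, a spider renders JavaScript pages through it roughly like this (sketch; scrapy-splash also needs SPLASH_URL and its middlewares added to settings.py per its README):

    import scrapy
    from scrapy_splash import SplashRequest

    class JsSpider(scrapy.Spider):
        name = 'js_spider'

        def start_requests(self):
            yield SplashRequest('https://example.com', self.parse, args={'wait': 1})

        def parse(self, response):
            self.log(response.css('title::text').get())   # title of the rendered page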
    
  • Installing scrapy-redis

    pip install scrapy-redis
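
    scrapy-redis is wired in mainly through settings.py; the usual minimal settings look like this (sketch):

    # settings.py
    SCHEDULER = 'scrapy_redis.scheduler.Scheduler'                # schedule requests through redis
    DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'    # shared de-duplication
    REDIS_URL = 'redis://localhost:6379'                          # the redis instance installed above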
    
  • Installing scrapyd, a tool for deploying and running Scrapy projects

    pip install scrapyd
    
    sudo mkdir /etc/scrapyd
    sudo gedit /etc/scrapyd/scrapyd.conf
      
    # write the following content into it
    [scrapyd]
    eggs_dir = eggs
    logs_dir = logs
    items_dir =
    jobs_to_keep = 5
    dbs_dir = dbs
    max_proc = 0
    max_proc_per_cpu = 10
    finished_to_keep = 100
    poll_interval = 5.0
    bind_address = 0.0.0.0
    http_port = 6800
    debug = off
    runner = scrapyd.runner
    application = scrapyd.app.application
    launcher = scrapyd.launcher.Launcher
    webroot = scrapyd.website.Root
      
    [services]
    schedule.json = scrapyd.webservice.Schedule
    cancel.json = scrapyd.webservice.Cancel
    addversion.json = scrapyd.webservice.AddVersion
    listprojects.json = scrapyd.webservice.ListProjects
    listversions.json = scrapyd.webservice.ListVersions
    listspiders.json = scrapyd.webservice.ListSpiders
    delproject.json = scrapyd.webservice.DeleteProject
    delversion.json = scrapyd.webservice.DeleteVersion
    listjobs.json = scrapyd.webservice.ListJobs
    daemonstatus.json = scrapyd.webservice.DaemonStatus
      
    
    # run scrapyd in the background
    scrapyd > /dev/null &
    
  • Installing nginx, used as a reverse proxy to put basic auth in front of scrapyd

    sudo apt install nginx
    
    # edit /etc/nginx/nginx.conf and add the server block below inside the existing http block
    http {
        server {
            listen 6801;
            location / {
                proxy_pass              http://127.0.0.1:6800/;
                auth_basic              "Restricted";
                auth_basic_user_file    /etc/nginx/conf.d/.htpasswd;
            }
        }
    }
    
    sudo apt install apache2-utils
    # create the password file referenced by auth_basic_user_file above
    sudo htpasswd -c /etc/nginx/conf.d/.htpasswd admin
    sudo nginx -s reload
    
  • Installing python-scrapyd-api

    pip install python-scrapyd-api
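
    Basic usage against the scrapyd instance above (sketch; 'myproject' and 'myspider' are hypothetical names):

    from scrapyd_api import ScrapydAPI

    scrapyd = ScrapydAPI('http://localhost:6800')
    print(scrapyd.list_projects())                          # projects deployed to this scrapyd
    # job_id = scrapyd.schedule('myproject', 'myspider')    # start a crawl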
    
  • Installing scrapyrt

    pip install scrapyrt
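
    scrapyrt exposes spiders over HTTP; after running scrapyrt inside a Scrapy project, a crawl can be triggered like this (sketch; the spider name is hypothetical):

    import requests

    resp = requests.get('http://localhost:9080/crawl.json',
                        params={'spider_name': 'myspider', 'url': 'https://example.com'})
    print(resp.json())   # crawl results as JSON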
    
  • Installing Gerapy

    pip install gerapy
    
