Прокси чекер

Общая логика такова:
1. В админке задаём урл ресурса с проксями, xpath-выражения для ip, port, type. По крону обходим все ресурсы, собираем адреса и кладем в базу.
2. Другой скрипт постепенно проверяет их на анонимность, локацию и скорость. Результат проверки записываем в виде статуса: OK, TIMEOUT или DELETE. Прокси со статусом DELETE удаляем через 1 день. Прокси со статусом TIMEOUT проверяем повторно (количество таких попыток ограничиваем в настройках; если прокси так и не ожил, ставим ему статус DELETE).
3. Выбираем прокси с минимальным временем ответа и используем по назначению.

Для работы с xpath использовал плагин FF FirePath. После установки FirePath в FireBug появляется дополнительная вкладка.
Для проверки анонимности на сервер положим простой скрипт ip.php:

// Print the address this server sees the request coming from.
// The variable name must be a quoted string: a bare REMOTE_ADDR is an
// undefined constant (a warning on PHP 7.2+, an Error on PHP 8).
$ip = getenv('REMOTE_ADDR');
echo $ip;

Далее код:

/**
 * Describes one web source of proxy lists: where its pages are located and
 * which XPath expressions extract the IP, port and type from them.
 *
 * The persistence mapping and validation rules are declared by the ORM and
 * Assert annotations in the property docblocks below.
 */
class ProxyServersSource implements EntityInterface
{
    /**
     * Identifier.
     *
     * @var integer $id
     *
     * @ORM\Id
     * @ORM\Column(type="integer")
     * @ORM\GeneratedValue(strategy="AUTO")
     */
    protected $id;

    /**
     * Domain part of the page URL; ProxiesParser prepends it to the formatted URI.
     *
     * @var string|null $domain
     *
     * @Assert\Length(max=255)
     * @ORM\Column(type="string", length=255, nullable=true)
     */
    protected $domain;

    /**
     * URI pattern. ProxiesParser passes it to sprintf() together with the
     * current page number, so it may contain one sprintf placeholder.
     *
     * @var string|null $uriPattern
     *
     * @Assert\Length(max=255)
     * @ORM\Column(type="string", length=255, nullable=true)
     */
    protected $uriPattern;

    /**
     * Start page number (or proxy count) substituted into the URI pattern.
     *
     * @var int|null $startNumber
     * @ORM\Column(type="integer", nullable=true)
     */
    protected $startNumber;

    /**
     * Step added to the current number after each generated URL.
     *
     * @var int|null $stepNumber
     * @ORM\Column(type="integer", nullable=true)
     */
    protected $stepNumber;

    /**
     * End page number (or proxy count); URL generation stops once it is exceeded.
     *
     * @var int|null $endNumber
     * @ORM\Column(type="integer", nullable=true)
     */
    protected $endNumber;

    /**
     * XPath expression selecting the IP nodes. When it equals the port
     * expression, ProxiesParser treats every node as "ip:port".
     *
     * @var string $xPathIp
     *
     * @Assert\Length(max=255)
     * @ORM\Column(type="string", length=255)
     */
    protected $xPathIp;

    /**
     * XPath expression selecting the port nodes.
     *
     * @var string $xPathPort
     *
     * @Assert\Length(max=255)
     * @ORM\Column(type="string", length=255)
     */
    protected $xPathPort;

    /**
     * XPath expression selecting the proxy type nodes (optional).
     *
     * @var string|null $xPathType
     *
     * @Assert\Length(max=255)
     * @ORM\Column(type="string", length=255, nullable=true)
     */
    protected $xPathType;

    /**
     * Active flag; the parse command only loads sources where it is set.
     *
     * @var boolean $active
     *
     * @ORM\Column(type="boolean")
     */
    protected $active;

    /**
     * Row version.
     *
     * @var int
     *
     * @ORM\Column(type="integer")
     * @ORM\Version
     */
    protected $version;


    /**
     * Constructor. Sets no defaults: every property starts as null.
     */
    public function __construct()
    {
    }

    /**
     * Get id
     *
     * @return int
     */
    public function getId()
    {
        return $this->id;
    }

    /**
     * Set id
     *
     * @param int $id
     */
    public function setId($id)
    {
        $this->id = $id;
    }

    /**
     * Get domain
     *
     * @return string|null
     */
    public function getDomain()
    {
        return $this->domain;
    }

    /**
     * Set domain
     *
     * @param string|null $domain
     */
    public function setDomain($domain)
    {
        $this->domain = $domain;
    }

    /**
     * Get URI pattern
     *
     * @return string|null
     */
    public function getUriPattern()
    {
        return $this->uriPattern;
    }

    /**
     * Set URI pattern
     *
     * @param string|null $uriPattern
     */
    public function setUriPattern($uriPattern)
    {
        $this->uriPattern = $uriPattern;
    }

    /**
     * Get start number
     *
     * @return int|null
     */
    public function getStartNumber()
    {
        return $this->startNumber;
    }

    /**
     * Set start number
     *
     * @param int|null $startNumber
     */
    public function setStartNumber($startNumber)
    {
        $this->startNumber = $startNumber;
    }

    /**
     * Get step number
     *
     * @return int|null
     */
    public function getStepNumber()
    {
        return $this->stepNumber;
    }

    /**
     * Set step number
     *
     * @param int|null $stepNumber
     */
    public function setStepNumber($stepNumber)
    {
        $this->stepNumber = $stepNumber;
    }

    /**
     * Get end number
     *
     * @return int|null
     */
    public function getEndNumber()
    {
        return $this->endNumber;
    }

    /**
     * Set end number
     *
     * @param int|null $endNumber
     */
    public function setEndNumber($endNumber)
    {
        $this->endNumber = $endNumber;
    }

    /**
     * Get the XPath expression for IP nodes
     *
     * @return string
     */
    public function getXPathIp()
    {
        return $this->xPathIp;
    }

    /**
     * Set the XPath expression for IP nodes
     *
     * @param string $xPathIp
     */
    public function setXPathIp($xPathIp)
    {
        $this->xPathIp = $xPathIp;
    }

    /**
     * Get the XPath expression for port nodes
     *
     * @return string
     */
    public function getXPathPort()
    {
        return $this->xPathPort;
    }

    /**
     * Set the XPath expression for port nodes
     *
     * @param string $xPathPort
     */
    public function setXPathPort($xPathPort)
    {
        $this->xPathPort = $xPathPort;
    }

    /**
     * Get the XPath expression for proxy type nodes
     *
     * @return string|null
     */
    public function getXPathType()
    {
        return $this->xPathType;
    }

    /**
     * Set the XPath expression for proxy type nodes
     *
     * @param string|null $xPathType
     */
    public function setXPathType($xPathType)
    {
        $this->xPathType = $xPathType;
    }

    /**
     * Get active
     *
     * @return boolean
     */
    public function getActive()
    {
        return $this->active;
    }

    /**
     * Set active
     *
     * @param boolean $active
     */
    public function setActive($active)
    {
        $this->active = $active;
    }

    /**
     * Get version
     *
     * @return int
     */
    public function getVersion()
    {
        return $this->version;
    }

    /**
     * Set version
     *
     * @param int $version
     */
    public function setVersion($version)
    {
        $this->version = $version;
    }
}

 

<?php

/**
 * A single proxy server (ip + port) together with the result of its last check.
 *
 * The persistence mapping and validation rules are declared by the ORM and
 * Assert annotations in the property docblocks below.
 */
class GeneralProxy implements EntityInterface
{
    // Proxy types. The values are the NAMES of the cURL proxy-type constants
    // stored as strings, not the integer values of those constants.
    const TYPE_HTTP   = 'CURLPROXY_HTTP';
    const TYPE_SOCKS4 = 'CURLPROXY_SOCKS4';
    const TYPE_SOCKS5 = 'CURLPROXY_SOCKS5';

    // Check results written by the check command.
    const STATUS_OK = 0;
    const STATUS_TIMEOUT = 1;
    const STATUS_DELETE = 2;

    /**
     * Identifier.
     *
     * @var integer $id
     *
     * @ORM\Id
     * @ORM\Column(type="integer")
     * @ORM\GeneratedValue(strategy="AUTO")
     */
    protected $id;

    /**
     * IP address
     *
     * @var string $ip
     *
     * @Assert\Length(max=255)
     * @ORM\Column(type="string", length=255)
     */
    protected $ip;

    /**
     * Port
     *
     * @var int $port
     *
     * @ORM\Column(type="integer")
     */
    protected $port;

    /**
     * Country
     *
     * @var string|null $country
     *
     * @Assert\Length(max=255)
     * @ORM\Column(type="string", length=255, nullable=true)
     */
    protected $country;

    /**
     * Proxy speed; the check command fills it from cURL's total_time.
     *
     * @var int|null $speed
     *
     * @ORM\Column(type="integer", nullable=true)
     */
    protected $speed;

    /**
     * Proxy type; one of the TYPE_* constants.
     *
     * @var string|null $type
     *
     * @Assert\Length(max=255)
     * @ORM\Column(type="string", length=255, nullable=true)
     */
    protected $type;

    /**
     * Check time
     *
     * @var \DateTime|null $checkTime
     * @ORM\Column(type="datetime", nullable=true)
     */
    protected $checkTime;

    /**
     * Proxy status; one of the STATUS_* constants, null until the first check.
     *
     * @var int|null $status
     *
     * @ORM\Column(type="integer", nullable=true)
     */
    protected $status;

    /**
     * Counter of timed-out checks; the check command resets it to 0 on success.
     *
     * @var int $countTimeout
     *
     * @ORM\Column(type="integer")
     */
    protected $countTimeout;

    /**
     * Active flag; the check command only loads proxies where it is set.
     *
     * @var boolean $active
     *
     * @ORM\Column(type="boolean")
     */
    protected $active;

    /**
     * Row version.
     *
     * @var int
     *
     * @ORM\Column(type="integer")
     * @ORM\Version
     */
    protected $version;


    /**
     * Constructor. A new proxy starts active with a zero timeout counter.
     */
    public function __construct()
    {
        $this->active = true;
        $this->countTimeout = 0;
    }

    /**
     * Get id
     *
     * @return int
     */
    public function getId()
    {
        return $this->id;
    }

    /**
     * Set id
     *
     * @param int $id
     */
    public function setId($id)
    {
        $this->id = $id;
    }

    /**
     * Get IP
     *
     * @return string
     */
    public function getIp()
    {
        return $this->ip;
    }

    /**
     * Set IP
     *
     * @param string $ip
     */
    public function setIp($ip)
    {
        $this->ip = $ip;
    }

    /**
     * Get port
     *
     * @return int|string Mapped as an integer column; returned exactly as stored on the object
     */
    public function getPort()
    {
        return $this->port;
    }

    /**
     * Set port
     *
     * @param int|string $port Port number (ProxiesParser passes the numeric text taken from the page)
     */
    public function setPort($port)
    {
        $this->port = $port;
    }

    /**
     * Get country
     *
     * @return string|null
     */
    public function getCountry()
    {
        return $this->country;
    }

    /**
     * Set country
     *
     * @param string|null $country
     */
    public function setCountry($country)
    {
        $this->country = $country;
    }

    /**
     * Get proxy speed
     *
     * @return int|null
     */
    public function getSpeed()
    {
        return $this->speed;
    }

    /**
     * Set proxy speed
     *
     * @param int|null $speed
     */
    public function setSpeed($speed)
    {
        $this->speed = $speed;
    }

    /**
     * Get type
     *
     * @return string|null One of the TYPE_* constants
     */
    public function getType()
    {
        return $this->type;
    }

    /**
     * Set type
     *
     * @param string|null $type One of the TYPE_* constants
     */
    public function setType($type)
    {
        $this->type = $type;
    }

    /**
     * Get check time
     *
     * @return \DateTime|null
     */
    public function getCheckTime()
    {
        return $this->checkTime;
    }

    /**
     * Set check time
     *
     * @param \DateTime|null $checkTime
     */
    public function setCheckTime($checkTime)
    {
        $this->checkTime = $checkTime;
    }

    /**
     * Get proxy status
     *
     * @return int|null One of the STATUS_* constants
     */
    public function getStatus()
    {
        return $this->status;
    }

    /**
     * Set proxy status
     *
     * @param int|null $status One of the STATUS_* constants
     */
    public function setStatus($status)
    {
        $this->status = $status;
    }

    /**
     * Get the timeout counter
     *
     * @return int
     */
    public function getCountTimeout()
    {
        return $this->countTimeout;
    }

    /**
     * Set the timeout counter
     *
     * @param int $countTimeout
     */
    public function setCountTimeout($countTimeout)
    {
        $this->countTimeout = $countTimeout;
    }

    /**
     * Get active
     *
     * @return boolean
     */
    public function getActive()
    {
        return $this->active;
    }

    /**
     * Set active
     *
     * @param boolean $active
     */
    public function setActive($active)
    {
        $this->active = $active;
    }

    /**
     * Get version
     *
     * @return int
     */
    public function getVersion()
    {
        return $this->version;
    }

    /**
     * Set version
     *
     * @param int $version
     */
    public function setVersion($version)
    {
        $this->version = $version;
    }
}
/**
 * Crawls the pages of a ProxyServersSource, extracts proxy addresses with the
 * source's XPath expressions and stores the ones not yet known as GeneralProxy rows.
 */
class ProxiesParser
{
    /** @var \Doctrine\ORM\EntityManager */
    protected $em;

    /** @var \Doctrine\ORM\EntityRepository Repository of GeneralProxy entities */
    protected $generalProxyRepo;

    /** @var \Monolog\Logger */
    protected $logger;

    /** @var \ProxyBundle\Entity\ProxyServersSource|null Source currently being crawled */
    protected $proxyServersSource;

    /** @var string|null Body of the page fetched last */
    protected $html;

    /**
     * ProxiesParser constructor.
     *
     * @param Registry $doctrine
     * @param Logger   $logger
     */
    public function __construct(Registry $doctrine, Logger $logger)
    {
        $this->em     = $doctrine->getManager();
        $this->logger = $logger;
        $this->generalProxyRepo = $this->em->getRepository('ProxyBundle:GeneralProxy');
    }

    /**
     * Select the source that the next parse()/process() call works on.
     *
     * @param ProxyServersSource $proxyServersSource
     */
    public function setProxyServersSource(ProxyServersSource $proxyServersSource)
    {
        $this->proxyServersSource = $proxyServersSource;
    }

    /**
     * Parse the page currently held in $this->html.
     *
     * Entries keep the index of the IP node they came from, so that the type
     * nodes can be matched to them by position. Nodes without an IP or without
     * a numeric port are skipped instead of producing half-filled entries.
     *
     * @return array[] List of ['ip' => string, 'port' => string, 'type' => string]
     */
    public function parse()
    {
        $result = [];

        // Nothing fetched yet, or an empty body: loadHTML() cannot handle it.
        if (!is_string($this->html) || '' === trim($this->html)) {
            return $result;
        }

        // Scraped pages are rarely valid HTML: buffer libxml errors instead of
        // emitting warnings, then drop them and restore the previous setting.
        $previousSetting = libxml_use_internal_errors(true);
        $doc = new \DOMDocument();
        $doc->loadHTML($this->html);
        libxml_clear_errors();
        libxml_use_internal_errors($previousSetting);

        $xpath = new \DOMXPath($doc);
        $ipExpression   = $this->proxyServersSource->getXPathIp();
        $portExpression = $this->proxyServersSource->getXPathPort();

        // Identical expressions mean every matched node holds "ip:port".
        $combined = ($ipExpression === $portExpression);
        $ports = $combined ? [] : $this->queryNodes($xpath, $portExpression);

        foreach ($this->queryNodes($xpath, $ipExpression) as $key => $ipNode) {
            if ($combined) {
                $parts = explode(':', $ipNode->nodeValue);
                $ip    = trim($parts[0]);
                $port  = isset($parts[1]) ? trim($parts[1]) : '';
            } else {
                $ip   = trim($ipNode->nodeValue);
                $port = isset($ports[$key]) ? trim($ports[$key]->nodeValue) : '';
            }

            if ('' === $ip || !preg_match('/^\d+$/', $port)) {
                continue;
            }

            $result[$key] = ['ip' => $ip, 'port' => $port, 'type' => 'HTTP'];
        }

        // Only refine entries that exist; a surplus type node must not create
        // an entry that has no ip/port.
        foreach ($this->queryNodes($xpath, $this->proxyServersSource->getXPathType()) as $key => $typeNode) {
            if (isset($result[$key])) {
                $result[$key]['type'] = $typeNode->nodeValue;
            }
        }

        return $result;
    }

    /**
     * Crawl every page of the current source and persist unknown proxies.
     *
     * A page that cannot be fetched is logged and skipped, so one dead URL
     * does not stop the remaining pages from being processed.
     *
     * @throws \LogicException When no source has been set
     */
    public function process()
    {
        if (null === $this->proxyServersSource) {
            throw new \LogicException('setProxyServersSource() must be called before process().');
        }

        // Addresses already handled in this run. findOneBy() does not see
        // entities that are persisted but not yet flushed, so without this a
        // proxy listed twice on one page would be inserted twice.
        $seen = [];

        foreach ($this->prepareUrls() as $url) {
            $this->logger->info('source URL for parsing: ' . $url);

            try {
                $this->html = $this->doRequest($url);
            } catch (\RuntimeException $e) {
                $this->logger->error(sprintf('Skipping source URL %s: %s', $url, $e->getMessage()));
                continue;
            }

            foreach ($this->parse() as $proxy) {
                $address = $proxy['ip'] . ':' . $proxy['port'];
                if (isset($seen[$address])) {
                    continue;
                }
                $seen[$address] = true;

                $generalProxy = $this->generalProxyRepo->findOneBy(['ip' => $proxy['ip'], 'port' => $proxy['port']]);
                if (null === $generalProxy) {
                    $generalProxy = new GeneralProxy();
                    $generalProxy->setIp($proxy['ip']);
                    $generalProxy->setPort($proxy['port']);
                    $generalProxy->setType($this->prepareType($proxy['type']));
                    $this->em->persist($generalProxy);
                }
            }
            $this->em->flush();
        }
    }

    /**
     * Map the type text found on the page to a GeneralProxy::TYPE_* constant.
     *
     * The match is case-insensitive; anything that is not SOCKS4/SOCKS5 is HTTP.
     *
     * @param string $type
     *
     * @return string
     */
    private function prepareType($type)
    {
        if (false !== stripos($type, 'SOCKS4')) {
            return GeneralProxy::TYPE_SOCKS4;
        }
        if (false !== stripos($type, 'SOCKS5')) {
            return GeneralProxy::TYPE_SOCKS5;
        }

        return GeneralProxy::TYPE_HTTP;
    }

    /**
     * Generate the page URLs of the current source.
     *
     * Defaults: start 0, end 1, step 1. A step below 1 is treated as 1,
     * because the counter would otherwise never reach the end.
     *
     * @return \Generator
     */
    private function prepareUrls()
    {
        $source  = $this->proxyServersSource;
        $pattern = $source->getUriPattern();

        // Without a pattern the number cannot change the URL: fetch it once.
        if (null === $pattern) {
            yield $source->getDomain() . $this->prepareUri($pattern, 0);

            return;
        }

        $step = (null !== $source->getStepNumber()) ? (int) $source->getStepNumber() : 1;
        if ($step < 1) {
            $step = 1;
        }
        $i   = (null !== $source->getStartNumber()) ? $source->getStartNumber() : 0;
        $end = (null !== $source->getEndNumber()) ? $source->getEndNumber() : 1;

        for (; $i <= $end; $i += $step) {
            yield $source->getDomain() . $this->prepareUri($pattern, $i);
        }
    }

    /**
     * Substitute the page number into the URI pattern.
     *
     * @param string|null $uriPattern  sprintf() pattern, or null for no URI part
     * @param int         $startNumber Number to substitute
     *
     * @return string
     */
    private function prepareUri($uriPattern, $startNumber)
    {
        return (null !== $uriPattern) ? sprintf($uriPattern, $startNumber) : "";
    }

    /**
     * Run an XPath query and return the matched nodes as a plain array.
     *
     * @param \DOMXPath   $xpath
     * @param string|null $expression
     *
     * @return \DOMNode[] Matched nodes by position; empty when the expression
     *                    is missing or invalid
     */
    private function queryNodes(\DOMXPath $xpath, $expression)
    {
        if (null === $expression || '' === $expression) {
            return [];
        }

        $nodes = $xpath->query($expression);
        if (false === $nodes) {
            $this->logger->warning('Invalid XPath expression: ' . $expression);

            return [];
        }

        return iterator_to_array($nodes);
    }

    /**
     * Do cURL request.
     *
     * @param string $url Page URL
     *
     * @return string Response body
     * @throws \RuntimeException When the handle cannot be created or the transfer fails
     */
    private function doRequest($url)
    {
        // Initializing curl
        $ch = curl_init($url);
        if (false === $ch) {
            throw new \RuntimeException(sprintf('cURL error: cannot initialise a handle for "%s"', $url));
        }

        try {
            // Configuring curl options
            curl_setopt_array($ch, [
                CURLOPT_RETURNTRANSFER => true,
                CURLOPT_TIMEOUT        => 10,
                CURLOPT_USERAGENT      => 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)'.
                    ' Ubuntu Chromium/52.0.2743.116 Chrome/52.0.2743.116 Safari/537.36'
            ]);
            // Fetching the page body
            $result = curl_exec($ch);

            if (false === $result) {
                throw new \RuntimeException(sprintf('cURL error: %s (%d)', curl_error($ch), curl_errno($ch)));
            }

            return $result;
        } finally {
            // Release the handle on success and on failure alike.
            curl_close($ch);
        }
    }
}

 

/**
 * Console command "proxyservers:sources:parse": runs ProxiesParser over every
 * active ProxyServersSource.
 */
class ParseProxyServersSourceCommand extends ContainerAwareCommand
{
    /** @var \ProxyBundle\Service\Parser\ProxiesParser */
    protected $proxiesParser;

    /** @var \Doctrine\ORM\EntityManager */
    protected $em;

    /**
     * {@inheritdoc}
     */
    protected function configure()
    {
        $help = <<<EOT
            The <info>proxyservers:sources:parse</info> parse proxy list for source urls.

<info>php app/console proxyservers:sources:parse</info>
EOT;

        $this->setName('proxyservers:sources:parse');
        $this->setDescription('Parse proxyserver lists');
        $this->setHelp($help);
    }

    /**
     * Fetch the parser service and the entity manager from the container,
     * then announce the start unless the output is quiet.
     *
     * @param \Symfony\Component\Console\Input\InputInterface   $input  Input
     * @param \Symfony\Component\Console\Output\OutputInterface $output Output
     *
     * @return void
     *
     * @throws \Exception
     */
    protected function initialize(InputInterface $input, OutputInterface $output)
    {
        parent::initialize($input, $output);

        $container = $this->getContainer();
        $this->proxiesParser = $container->get('service.proxies_parser');
        $this->em = $container->get('doctrine.orm.entity_manager');

        if (OutputInterface::VERBOSITY_QUIET === $output->getVerbosity()) {
            return;
        }

        $output->writeln('<info>Start parse:</info>');
    }

    /**
     * {@inheritdoc}
     */
    protected function execute(InputInterface $input, OutputInterface   $output)
    {
        $sources = $this->em
            ->getRepository('ProxyBundle:ProxyServersSource')
            ->findBy(["active" => 1]);

        foreach ($sources as $source) {
            // Anything that is not a source entity is ignored.
            if (!$source instanceof ProxyServersSource) {
                continue;
            }

            $this->proxiesParser->setProxyServersSource($source);
            $this->proxiesParser->process();
        }

        if (OutputInterface::VERBOSITY_QUIET === $output->getVerbosity()) {
            return;
        }

        $output->writeln('');
        $output->writeln("<info>Work complete successfully.</info>");
    }
}

 

/**
 * Console command "proxyservers:proxies:check".
 *
 * First removes proxies that have been in DELETE status long enough, then
 * requests REMOTE_TEST_URL through every active proxy and records the result
 * as OK, TIMEOUT or DELETE.
 */
class CheckProxyServersCommand extends ContainerAwareCommand
{
    /** URL of the script that echoes the address it sees the request from. */
    const REMOTE_TEST_URL   = "http://example.com/ip.php";
    /** Full days a DELETE-status proxy is kept, counted from its check time. */
    const DELETE_AFTER_DAY  = 1;
    /** Timeout count above which a timed-out proxy is marked DELETE. */
    const MAX_COUNT_TIMEOUT = 5;
    /** cURL transfer timeout for one check, in seconds. */
    const CURLOPT_TIMEOUT   = 20;

    /** @var \Doctrine\ORM\EntityManager */
    protected $em;

    /**
     * {@inheritdoc}
     */
    protected function configure()
    {
        $this->setName('proxyservers:proxies:check')->setDescription(
            'Check proxy servers'
        )->setHelp(
            <<<EOT
            The <info>proxyservers:proxies:check</info> check proxy list.

<info>php app/console proxyservers:proxies:check</info>
EOT
        );
    }

    /**
     * Initialize Command
     *
     * @param \Symfony\Component\Console\Input\InputInterface   $input  Input
     * @param \Symfony\Component\Console\Output\OutputInterface $output Output
     *
     * @return void
     *
     * @throws \Exception
     */
    protected function initialize(InputInterface $input, OutputInterface $output)
    {
        parent::initialize($input, $output);

        $this->em = $this->getContainer()->get('doctrine.orm.entity_manager');

        if ($output->getVerbosity() !== OutputInterface::VERBOSITY_QUIET) {
            $output->writeln('<info>Start check:</info>');
        }
    }

    /**
     * {@inheritdoc}
     *
     * @return int Exit code, always 0
     */
    protected function execute(InputInterface $input, OutputInterface   $output)
    {
        $generalProxyRepository = $this->em->getRepository('ProxyBundle:GeneralProxy');

        $this->purgeDeletedProxies($generalProxyRepository->findBy(["status" => GeneralProxy::STATUS_DELETE]));

        foreach ($generalProxyRepository->findBy(["active" => 1]) as $generalProxy) {
            /** @var \ProxyBundle\Entity\GeneralProxy $generalProxy */
            $this->updateProxyStatus($generalProxy, $this->doRequest($generalProxy));
            // Flush per proxy so that an interrupted run keeps the results so far.
            $this->em->flush();
        }

        if ($output->getVerbosity() !== OutputInterface::VERBOSITY_QUIET) {
            $output->writeln('');
            $output->writeln("<info>Work complete successfully.</info>");
        }

        return 0;
    }

    /**
     * Remove DELETE-status proxies that have no check time or whose check time
     * is at least DELETE_AFTER_DAY full days old.
     *
     * @param GeneralProxy[] $generalProxies Proxies in DELETE status
     */
    private function purgeDeletedProxies($generalProxies)
    {
        $now = new \DateTime();

        foreach ($generalProxies as $generalProxy) {
            $checkTime = $generalProxy->getCheckTime();

            // DateInterval::$days is the total number of days; ::$d is only the
            // day component (0-30) and wraps around every month.
            if (null === $checkTime || $now->diff($checkTime)->days >= self::DELETE_AFTER_DAY) {
                $this->em->remove($generalProxy);
            }
        }

        $this->em->flush();
    }

    /**
     * Translate the outcome of one check request into the proxy's new state.
     *
     * @param GeneralProxy $generalProxy Proxy that was checked
     * @param array        $data         Result of doRequest()
     */
    private function updateProxyStatus(GeneralProxy $generalProxy, array $data)
    {
        $now = new \DateTime();
        // Read before any setStatus() call: the timeout branch must know
        // whether this is the first timeout in a row.
        $previousStatus = $generalProxy->getStatus();
        $response = $data['response'];

        if (is_string($response) && trim($response) === (string) $generalProxy->getIp()) {
            // The test script saw the proxy's address: the proxy works.
            $generalProxy->setStatus(GeneralProxy::STATUS_OK);
            // total_time is a float in seconds and the column is an integer,
            // so store milliseconds; whole seconds would be 0 for fast proxies.
            $generalProxy->setSpeed((int) round($data['getinfo']['total_time'] * 1000));
            $generalProxy->setCountTimeout(0);
            $generalProxy->setCheckTime($now);
        } elseif ($this->checkOldTimeout($data, $generalProxy->getCountTimeout())) {
            $generalProxy->setStatus(GeneralProxy::STATUS_DELETE);
            if (null === $generalProxy->getCheckTime()) {
                $generalProxy->setCheckTime($now);
            }
        } elseif ($this->checkTimeout($data)) {
            $generalProxy->setStatus(GeneralProxy::STATUS_TIMEOUT);
            $generalProxy->setCountTimeout($generalProxy->getCountTimeout()+1);
            // Stamp only the first timeout in a row.
            if (GeneralProxy::STATUS_TIMEOUT !== $previousStatus) {
                $generalProxy->setCheckTime($now);
            }
        } else {
            $generalProxy->setStatus(GeneralProxy::STATUS_DELETE);
            // Keep an existing check time so that re-checks do not postpone the purge.
            if (null  === $generalProxy->getCheckTime()) {
                $generalProxy->setCheckTime($now);
            }
        }
    }

    /**
     * Request REMOTE_TEST_URL through the given proxy.
     *
     * @param GeneralProxy $proxy GeneralProxy
     *
     * @return array ['response' => string|false, 'getinfo' => array, 'errno' => int]
     */
    private function doRequest($proxy)
    {
        // GeneralProxy stores the NAME of the cURL constant, while
        // CURLOPT_PROXYTYPE needs its integer value; a name passed as-is is
        // cast to 0, i.e. always HTTP.
        $proxyTypes = [
            GeneralProxy::TYPE_HTTP   => CURLPROXY_HTTP,
            GeneralProxy::TYPE_SOCKS4 => CURLPROXY_SOCKS4,
            GeneralProxy::TYPE_SOCKS5 => CURLPROXY_SOCKS5,
        ];
        $type = $proxy->getType();
        $proxyType = (null !== $type && isset($proxyTypes[$type])) ? $proxyTypes[$type] : CURLPROXY_HTTP;

        // Initializing curl
        $ch = curl_init(self::REMOTE_TEST_URL);

        // Configuring curl options
        $options = [
            CURLOPT_PROXY          => $proxy->getIp(),
            CURLOPT_PROXYPORT      => (int) $proxy->getPort(),
            CURLOPT_PROXYTYPE      => $proxyType,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_TIMEOUT        => self::CURLOPT_TIMEOUT,
            CURLOPT_USERAGENT      => 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)'.
                ' Ubuntu Chromium/52.0.2743.116 Chrome/52.0.2743.116 Safari/537.36'
        ];
        // Setting curl options
        curl_setopt_array($ch, $options);
        // Fetching the body returned by the test script
        $result = curl_exec($ch);

        $data = ['response' => $result, 'getinfo' => curl_getinfo($ch), 'errno' => curl_errno($ch)];
        // Collect everything needed from the handle before releasing it.
        curl_close($ch);

        return $data;
    }

    /**
     * Tell whether the request timed out and the proxy has already exceeded
     * the allowed number of timeouts.
     *
     * @param array $data         Result of doRequest()
     * @param int   $countTimeout Timeouts recorded so far
     * @return bool
     */
    private function checkOldTimeout($data, $countTimeout)
    {
        return $this->checkTimeout($data) && ($countTimeout > self::MAX_COUNT_TIMEOUT);
    }

    /**
     * Tell whether the request failed with cURL's operation-timeout error.
     *
     * @param array $data Result of doRequest()
     * @return bool
     */
    private function checkTimeout($data)
    {
        return (false === $data['response']) && ($data['errno'] === CURLE_OPERATION_TIMEOUTED);
    }
}