Общая логика такова:
1. В админке задаём URL ресурса с проксями и XPath-выражения для ip, port, type. По крону обходим все ресурсы, собираем адреса и кладём в базу.
2. Другой скрипт постепенно проверяет их на анонимность, локацию и скорость. Результат проверки записываем в виде статуса: OK, TIMEOUT, DELETE. Прокси со статусом DELETE удаляем через 1 день. Прокси со статусом TIMEOUT проверяем ещё раз (в настройках ограничиваем количество таких обращений, и если прокси не ожил, ставим ему статус DELETE).
3. Делаем выборку проксей с минимальным временем ответа и используем по назначению.
Для работы с xpath использовал плагин FF FirePath. После установки FirePath в FireBug появляется дополнительная вкладка.
Для проверки анонимности на сервер положим простой скрипт ip.php:
// ip.php: prints the address the request arrived from, so that the proxy
// checker can compare it with the IP of the proxy it sent the request through.
// The key is quoted: a bare REMOTE_ADDR is an undefined constant (an Error on PHP 8).
$ip = isset($_SERVER['REMOTE_ADDR']) ? $_SERVER['REMOTE_ADDR'] : '';
echo $ip;
Далее код:
/**
 * A web resource that publishes proxy lists, together with the settings
 * needed to scrape it: where the pages are and which XPath expressions
 * select the IP, port and proxy type.
 */
class ProxyServersSource implements EntityInterface
{
    /**
     * Primary key.
     *
     * @var int
     *
     * @ORM\Id
     * @ORM\Column(type="integer")
     * @ORM\GeneratedValue(strategy="AUTO")
     */
    protected $id;

    /**
     * Domain part of the page URL; the formatted URI pattern is appended to it.
     *
     * @var string|null
     *
     * @Assert\Length(max=255)
     * @ORM\Column(type="string", length=255, nullable=true)
     */
    protected $domain;

    /**
     * sprintf() pattern for the URI; the current number is substituted into it.
     *
     * @var string|null
     *
     * @Assert\Length(max=255)
     * @ORM\Column(type="string", length=255, nullable=true)
     */
    protected $uriPattern;

    /**
     * First number substituted into the URI pattern (a page number or a proxy count).
     *
     * @var int|null
     *
     * @ORM\Column(type="integer", nullable=true)
     */
    protected $startNumber;

    /**
     * Increment applied to the number between two consecutive pages.
     *
     * @var int|null
     *
     * @ORM\Column(type="integer", nullable=true)
     */
    protected $stepNumber;

    /**
     * Last number substituted into the URI pattern (a page number or a proxy count).
     *
     * @var int|null
     *
     * @ORM\Column(type="integer", nullable=true)
     */
    protected $endNumber;

    /**
     * XPath expression selecting the IP nodes.
     *
     * @var string
     *
     * @Assert\Length(max=255)
     * @ORM\Column(type="string", length=255)
     */
    protected $xPathIp;

    /**
     * XPath expression selecting the port nodes.
     *
     * @var string
     *
     * @Assert\Length(max=255)
     * @ORM\Column(type="string", length=255)
     */
    protected $xPathPort;

    /**
     * XPath expression selecting the proxy type nodes (optional).
     *
     * @var string|null
     *
     * @Assert\Length(max=255)
     * @ORM\Column(type="string", length=255, nullable=true)
     */
    protected $xPathType;

    /**
     * Whether the source takes part in crawling.
     *
     * @var bool
     *
     * @ORM\Column(type="boolean")
     */
    protected $active;

    /**
     * Row version used by Doctrine for optimistic locking.
     *
     * @var int
     *
     * @ORM\Column(type="integer")
     * @ORM\Version
     */
    protected $version;

    /**
     * Creates an empty source; every field is filled in through the setters.
     */
    public function __construct()
    {
    }

    /**
     * @return int
     */
    public function getId()
    {
        return $this->id;
    }

    /**
     * @param int $id
     */
    public function setId($id)
    {
        $this->id = $id;
    }

    /**
     * @return string|null
     */
    public function getDomain()
    {
        return $this->domain;
    }

    /**
     * @param string|null $domain
     */
    public function setDomain($domain)
    {
        $this->domain = $domain;
    }

    /**
     * @return string|null
     */
    public function getUriPattern()
    {
        return $this->uriPattern;
    }

    /**
     * @param string|null $uriPattern
     */
    public function setUriPattern($uriPattern)
    {
        $this->uriPattern = $uriPattern;
    }

    /**
     * @return int|null
     */
    public function getStartNumber()
    {
        return $this->startNumber;
    }

    /**
     * @param int|null $startNumber
     */
    public function setStartNumber($startNumber)
    {
        $this->startNumber = $startNumber;
    }

    /**
     * @return int|null
     */
    public function getStepNumber()
    {
        return $this->stepNumber;
    }

    /**
     * @param int|null $stepNumber
     */
    public function setStepNumber($stepNumber)
    {
        $this->stepNumber = $stepNumber;
    }

    /**
     * @return int|null
     */
    public function getEndNumber()
    {
        return $this->endNumber;
    }

    /**
     * @param int|null $endNumber
     */
    public function setEndNumber($endNumber)
    {
        $this->endNumber = $endNumber;
    }

    /**
     * @return string
     */
    public function getXPathIp()
    {
        return $this->xPathIp;
    }

    /**
     * @param string $xPathIp
     */
    public function setXPathIp($xPathIp)
    {
        $this->xPathIp = $xPathIp;
    }

    /**
     * @return string
     */
    public function getXPathPort()
    {
        return $this->xPathPort;
    }

    /**
     * @param string $xPathPort
     */
    public function setXPathPort($xPathPort)
    {
        $this->xPathPort = $xPathPort;
    }

    /**
     * @return string|null
     */
    public function getXPathType()
    {
        return $this->xPathType;
    }

    /**
     * @param string|null $xPathType
     */
    public function setXPathType($xPathType)
    {
        $this->xPathType = $xPathType;
    }

    /**
     * @return bool
     */
    public function getActive()
    {
        return $this->active;
    }

    /**
     * @param bool $active
     */
    public function setActive($active)
    {
        $this->active = $active;
    }

    /**
     * @return int
     */
    public function getVersion()
    {
        return $this->version;
    }

    /**
     * @param int $version
     */
    public function setVersion($version)
    {
        $this->version = $version;
    }
}
<?php

/**
 * A single proxy server collected from a source, together with the result
 * of its most recent check.
 */
class GeneralProxy implements EntityInterface
{
    // Proxy types. The values are the NAMES of the cURL proxy-type constants
    // stored as strings, not the integer values of those constants.
    const TYPE_HTTP = 'CURLPROXY_HTTP';
    const TYPE_SOCKS4 = 'CURLPROXY_SOCKS4';
    const TYPE_SOCKS5 = 'CURLPROXY_SOCKS5';

    // Check results.
    const STATUS_OK = 0;
    const STATUS_TIMEOUT = 1;
    const STATUS_DELETE = 2;

    /**
     * Primary key.
     *
     * @var int
     *
     * @ORM\Id
     * @ORM\Column(type="integer")
     * @ORM\GeneratedValue(strategy="AUTO")
     */
    protected $id;

    /**
     * IP address of the proxy.
     *
     * @var string
     *
     * @Assert\Length(max=255)
     * @ORM\Column(type="string", length=255)
     */
    protected $ip;

    /**
     * Port the proxy listens on.
     *
     * @var int
     *
     * @ORM\Column(type="integer")
     */
    protected $port;

    /**
     * Country of the proxy.
     *
     * @var string|null
     *
     * @Assert\Length(max=255)
     * @ORM\Column(type="string", length=255, nullable=true)
     */
    protected $country;

    /**
     * Speed of the proxy as recorded by the last successful check.
     *
     * @var int|null
     *
     * @ORM\Column(type="integer", nullable=true)
     */
    protected $speed;

    /**
     * Proxy type, one of the TYPE_* constants.
     *
     * @var string|null
     *
     * @Assert\Length(max=255)
     * @ORM\Column(type="string", length=255, nullable=true)
     */
    protected $type;

    /**
     * Time of the check recorded for this proxy.
     *
     * @var \DateTime|null
     *
     * @ORM\Column(type="datetime", nullable=true)
     */
    protected $checkTime;

    /**
     * Result of the last check, one of the STATUS_* constants.
     *
     * @var int|null
     *
     * @ORM\Column(type="integer", nullable=true)
     */
    protected $status;

    /**
     * Number of timeouts counted for this proxy.
     *
     * @var int
     *
     * @ORM\Column(type="integer")
     */
    protected $countTimeout;

    /**
     * Whether the proxy takes part in checks.
     *
     * @var bool
     *
     * @ORM\Column(type="boolean")
     */
    protected $active;

    /**
     * Row version used by Doctrine for optimistic locking.
     *
     * @var int
     *
     * @ORM\Column(type="integer")
     * @ORM\Version
     */
    protected $version;

    /**
     * A new proxy starts out active and with no timeouts counted.
     */
    public function __construct()
    {
        $this->countTimeout = 0;
        $this->active = true;
    }

    /**
     * @return int
     */
    public function getId()
    {
        return $this->id;
    }

    /**
     * @param int $id
     */
    public function setId($id)
    {
        $this->id = $id;
    }

    /**
     * @return string
     */
    public function getIp()
    {
        return $this->ip;
    }

    /**
     * @param string $ip
     */
    public function setIp($ip)
    {
        $this->ip = $ip;
    }

    /**
     * @return int
     */
    public function getPort()
    {
        return $this->port;
    }

    /**
     * @param int $port
     */
    public function setPort($port)
    {
        $this->port = $port;
    }

    /**
     * @return string|null
     */
    public function getCountry()
    {
        return $this->country;
    }

    /**
     * @param string|null $country
     */
    public function setCountry($country)
    {
        $this->country = $country;
    }

    /**
     * @return int|null
     */
    public function getSpeed()
    {
        return $this->speed;
    }

    /**
     * @param int|null $speed
     */
    public function setSpeed($speed)
    {
        $this->speed = $speed;
    }

    /**
     * @return string|null One of the TYPE_* constants
     */
    public function getType()
    {
        return $this->type;
    }

    /**
     * @param string|null $type One of the TYPE_* constants
     */
    public function setType($type)
    {
        $this->type = $type;
    }

    /**
     * @return \DateTime|null
     */
    public function getCheckTime()
    {
        return $this->checkTime;
    }

    /**
     * @param \DateTime|null $checkTime
     */
    public function setCheckTime($checkTime)
    {
        $this->checkTime = $checkTime;
    }

    /**
     * @return int|null One of the STATUS_* constants
     */
    public function getStatus()
    {
        return $this->status;
    }

    /**
     * @param int|null $status One of the STATUS_* constants
     */
    public function setStatus($status)
    {
        $this->status = $status;
    }

    /**
     * @return int
     */
    public function getCountTimeout()
    {
        return $this->countTimeout;
    }

    /**
     * @param int $countTimeout
     */
    public function setCountTimeout($countTimeout)
    {
        $this->countTimeout = $countTimeout;
    }

    /**
     * @return bool
     */
    public function getActive()
    {
        return $this->active;
    }

    /**
     * @param bool $active
     */
    public function setActive($active)
    {
        $this->active = $active;
    }

    /**
     * @return int
     */
    public function getVersion()
    {
        return $this->version;
    }

    /**
     * @param int $version
     */
    public function setVersion($version)
    {
        $this->version = $version;
    }
}
/**
 * Downloads the pages of a proxy-list source, extracts "ip / port / type"
 * rows with the XPath expressions configured on the source and stores the
 * proxies that are not yet known as GeneralProxy entities.
 */
class ProxiesParser
{
    /** @var \Doctrine\ORM\EntityManager */
    protected $em;

    /** @var \Doctrine\ORM\EntityRepository Repository of GeneralProxy entities */
    protected $generalProxyRepo;

    /** @var \Monolog\Logger */
    protected $logger;

    /** @var \ProxyBundle\Entity\ProxyServersSource */
    protected $proxyServersSource;

    /** @var string HTML of the page that parse() works on */
    protected $html;

    /**
     * ProxiesParser constructor.
     *
     * @param Registry $doctrine
     * @param Logger   $logger
     */
    public function __construct(Registry $doctrine, Logger $logger)
    {
        $this->em = $doctrine->getManager();
        $this->logger = $logger;
        $this->generalProxyRepo = $this->em->getRepository('ProxyBundle:GeneralProxy');
    }

    /**
     * Selects the source that parse() and process() work with.
     *
     * @param ProxyServersSource $proxyServersSource
     */
    public function setProxyServersSource(ProxyServersSource $proxyServersSource)
    {
        $this->proxyServersSource = $proxyServersSource;
    }

    /**
     * Extracts proxies from the currently loaded page.
     *
     * When the IP and port XPath expressions are identical, every matched
     * node is expected to hold "ip:port"; otherwise the IP and port nodes are
     * paired by their position. Rows without a valid IP address or port are
     * skipped. The type defaults to 'HTTP' and is replaced by the text of the
     * type node at the same position when a type expression is configured.
     *
     * @return array Rows of the form ['ip' => string, 'port' => string, 'type' => string],
     *               keyed by the position of the IP node in the document
     */
    public function parse()
    {
        // DOMDocument::loadHTML() rejects empty input, and there is nothing to extract anyway.
        if (!is_string($this->html) || '' === trim($this->html)) {
            return [];
        }

        // Real-world HTML is rarely valid: collect libxml errors instead of emitting warnings,
        // then drop them and restore the previous mode so that the error buffer does not grow
        // with every page and the global setting does not leak out of this method.
        $previousErrorMode = libxml_use_internal_errors(true);
        $doc = new \DOMDocument();
        $doc->loadHTML($this->html);
        libxml_clear_errors();
        libxml_use_internal_errors($previousErrorMode);

        $xpath = new \DOMXPath($doc);
        $source = $this->proxyServersSource;

        $ips = $xpath->query($source->getXPathIp());
        if (false === $ips) {
            // DOMXPath::query() returns false for a malformed expression.
            return [];
        }

        $combined = $source->getXPathIp() === $source->getXPathPort();
        $ports = null;
        if (!$combined) {
            $ports = $xpath->query($source->getXPathPort());
            if (false === $ports) {
                return [];
            }
        }

        $result = [];
        foreach ($ips as $key => $ipNode) {
            if ($combined) {
                $parts = explode(':', $ipNode->nodeValue);
                if (count($parts) < 2) {
                    continue; // no "ip:port" pair in this node
                }
                $ip = $parts[0];
                $port = $parts[1];
            } else {
                $portNode = $ports->item($key);
                if (null === $portNode) {
                    continue; // fewer port nodes than IP nodes
                }
                $ip = $ipNode->nodeValue;
                $port = $portNode->nodeValue;
            }

            $entry = $this->buildEntry($ip, $port);
            if (null !== $entry) {
                $result[$key] = $entry;
            }
        }

        $typeExpression = $source->getXPathType();
        if (null !== $typeExpression && '' !== $typeExpression) {
            $types = $xpath->query($typeExpression);
            if (false !== $types) {
                foreach ($types as $key => $typeNode) {
                    // Only annotate rows that exist; otherwise a row without ip/port would be created.
                    if (isset($result[$key])) {
                        $result[$key]['type'] = $typeNode->nodeValue;
                    }
                }
            }
        }

        return $result;
    }

    /**
     * Crawls every page of the current source and persists the proxies that
     * are not stored yet.
     *
     * A page that cannot be downloaded is logged and skipped, so that one
     * unreachable page does not abort the rest of the crawl.
     */
    public function process()
    {
        foreach ($this->prepareUrls() as $url) {
            $this->logger->info('source URL for parsing: ' . $url);

            try {
                $this->html = $this->doRequest($url);
            } catch (\RuntimeException $e) {
                $this->logger->error(sprintf('Skipping "%s": %s', $url, $e->getMessage()));
                continue;
            }

            // findOneBy() only sees flushed rows, so a proxy listed twice on the same page
            // would be persisted twice; remember what was already handled for this page.
            $seen = [];

            foreach ($this->parse() as $proxy) {
                $ip = $proxy['ip'];
                $port = (int) $proxy['port'];
                $key = $ip . ':' . $port;

                if (isset($seen[$key])) {
                    continue;
                }
                $seen[$key] = true;

                if (null !== $this->generalProxyRepo->findOneBy(['ip' => $ip, 'port' => $port])) {
                    continue;
                }

                $generalProxy = new GeneralProxy();
                $generalProxy->setIp($ip);
                $generalProxy->setPort($port);
                $generalProxy->setType($this->prepareType($proxy['type']));
                $this->em->persist($generalProxy);
            }

            $this->em->flush();
        }
    }

    /**
     * Normalises a scraped ip/port pair into a result row.
     *
     * @param string $ip   Raw node text
     * @param string $port Raw node text
     *
     * @return array|null The row, or null when the IP address or the port is not valid
     */
    private function buildEntry($ip, $port)
    {
        // Besides ASCII whitespace, strip the two bytes of a UTF-8 non-breaking space,
        // which is common in HTML tables; IPs and ports are plain ASCII, so this is safe.
        $strip = " \t\n\r\0\x0B\xC2\xA0";
        $ip = trim($ip, $strip);
        $port = trim($port, $strip);

        if (false === filter_var($ip, FILTER_VALIDATE_IP)) {
            return null;
        }

        if (!ctype_digit($port) || (int) $port < 1 || (int) $port > 65535) {
            return null;
        }

        return ['ip' => $ip, 'port' => $port, 'type' => 'HTTP'];
    }

    /**
     * Maps the scraped type text to one of the GeneralProxy::TYPE_* constants.
     *
     * The match is case-insensitive; anything that is not SOCKS4 or SOCKS5 is
     * treated as HTTP.
     *
     * @param string $type
     *
     * @return string
     */
    private function prepareType($type)
    {
        if (false !== stripos($type, 'SOCKS4')) {
            return GeneralProxy::TYPE_SOCKS4;
        }

        if (false !== stripos($type, 'SOCKS5')) {
            return GeneralProxy::TYPE_SOCKS5;
        }

        return GeneralProxy::TYPE_HTTP;
    }

    /**
     * Yields the URLs of the pages to crawl.
     *
     * The number runs from the start number (default 0) to the end number
     * (default 1) in increments of the step (default 1). Every URL is yielded
     * only once, so a source whose pattern is empty or has no placeholder is
     * not downloaded repeatedly.
     *
     * @return \Generator
     */
    private function prepareUrls()
    {
        $source = $this->proxyServersSource;

        $step = (null !== $source->getStepNumber()) ? (int) $source->getStepNumber() : 1;
        if ($step < 1) {
            // A zero or negative step would never reach the end number.
            $step = 1;
        }

        $start = (null !== $source->getStartNumber()) ? $source->getStartNumber() : 0;
        $end = (null !== $source->getEndNumber()) ? $source->getEndNumber() : 1;

        $yielded = [];
        for ($i = $start; $i <= $end; $i += $step) {
            $url = $source->getDomain() . $this->prepareUri($source->getUriPattern(), $i);

            if (isset($yielded[$url])) {
                continue;
            }
            $yielded[$url] = true;

            yield $url;
        }
    }

    /**
     * Formats the URI of a page.
     *
     * @param string|null $uriPattern  sprintf() pattern, or null for "no URI"
     * @param int         $startNumber Number substituted into the pattern
     *
     * @return string
     */
    private function prepareUri($uriPattern, $startNumber)
    {
        return (null !== $uriPattern) ? sprintf($uriPattern, $startNumber) : "";
    }

    /**
     * Downloads a page with cURL.
     *
     * @param string $url Page URL
     *
     * @return string Response body
     *
     * @throws \RuntimeException When the transfer fails
     */
    private function doRequest($url)
    {
        $ch = curl_init($url);

        curl_setopt_array($ch, [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_TIMEOUT => 10,
            CURLOPT_USERAGENT => 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)'.
                ' Ubuntu Chromium/52.0.2743.116 Chrome/52.0.2743.116 Safari/537.36'
        ]);

        // With CURLOPT_RETURNTRANSFER the body is returned as a string, or false on failure.
        $result = curl_exec($ch);

        if (false === $result) {
            throw new \RuntimeException(sprintf('cURL error: %s (%d)', curl_error($ch), curl_errno($ch)));
        }

        return $result;
    }
}
/**
 * Console command "proxyservers:sources:parse": runs the proxies parser over
 * every active proxy-list source.
 */
class ParseProxyServersSourceCommand extends ContainerAwareCommand
{
    /** @var \ProxyBundle\Service\Parser\ProxiesParser */
    protected $proxiesParser;

    /** @var \Doctrine\ORM\EntityManager */
    protected $em;

    /**
     * {@inheritdoc}
     */
    protected function configure()
    {
        $this->setName('proxyservers:sources:parse')->setDescription(
            'Parse proxyserver lists'
        )->setHelp(
            <<<EOT
The <info>proxyservers:sources:parse</info> parse proxy list for source urls.

<info>php app/console proxyservers:sources:parse</info>
EOT
        );
    }

    /**
     * Fetches the parser service and the entity manager from the container.
     *
     * @param \Symfony\Component\Console\Input\InputInterface   $input  Input
     * @param \Symfony\Component\Console\Output\OutputInterface $output Output
     *
     * @return void
     *
     * @throws \Exception
     */
    protected function initialize(InputInterface $input, OutputInterface $output)
    {
        parent::initialize($input, $output);

        $this->proxiesParser = $this->getContainer()->get('service.proxies_parser');
        $this->em = $this->getContainer()->get('doctrine.orm.entity_manager');

        if ($output->getVerbosity() !== OutputInterface::VERBOSITY_QUIET) {
            $output->writeln('<info>Start parse:</info>');
        }
    }

    /**
     * Parses every active source.
     *
     * A source that fails is reported and skipped, so that the remaining
     * sources are still parsed; the command then exits with a non-zero code.
     *
     * @param \Symfony\Component\Console\Input\InputInterface   $input  Input
     * @param \Symfony\Component\Console\Output\OutputInterface $output Output
     *
     * @return int 0 when every source was parsed, 1 when at least one failed
     */
    protected function execute(InputInterface $input, OutputInterface $output)
    {
        /** @var \ProxyBundle\Entity\ProxyServersSourceRepository $proxyServersSourcesRepository */
        $proxyServersSourcesRepository = $this->em->getRepository('ProxyBundle:ProxyServersSource');
        $proxyServersSources = $proxyServersSourcesRepository->findBy(["active" => 1]);

        $failed = 0;

        foreach ($proxyServersSources as $proxyServersSource) {
            /** @var \ProxyBundle\Entity\ProxyServersSource $proxyServersSource */
            if (!$proxyServersSource instanceof ProxyServersSource) {
                continue;
            }

            try {
                $this->proxiesParser->setProxyServersSource($proxyServersSource);
                $this->proxiesParser->process();
            } catch (\Exception $e) {
                // Keep going: one broken source must not stop the others from being parsed.
                $failed++;
                $output->writeln(
                    sprintf('<error>Source #%s failed: %s</error>', $proxyServersSource->getId(), $e->getMessage())
                );
            }
        }

        if ($failed > 0) {
            $output->writeln('');
            $output->writeln(sprintf('<error>Work finished with %d failed source(s).</error>', $failed));

            return 1;
        }

        if ($output->getVerbosity() !== OutputInterface::VERBOSITY_QUIET) {
            $output->writeln('');
            $output->writeln("<info>Work complete successfully.</info>");
        }

        return 0;
    }
}
/**
 * Console command "proxyservers:proxies:check".
 *
 * First removes the proxies that have been marked for deletion long enough,
 * then requests REMOTE_TEST_URL through every active proxy and records the
 * outcome as a status: OK, TIMEOUT or DELETE.
 */
class CheckProxyServersCommand extends ContainerAwareCommand
{
    /** URL of a script that echoes the address the request came from. */
    const REMOTE_TEST_URL = "http://example.com/ip.php";

    /** Number of whole days a proxy keeps the DELETE status before it is removed. */
    const DELETE_AFTER_DAY = 1;

    /** A proxy that times out with more than this many counted timeouts is marked DELETE. */
    const MAX_COUNT_TIMEOUT = 5;

    /** Time limit of a single check request, in seconds. */
    const CURLOPT_TIMEOUT = 20;

    /** @var \Doctrine\ORM\EntityManager */
    protected $em;

    /**
     * {@inheritdoc}
     */
    protected function configure()
    {
        $this->setName('proxyservers:proxies:check')->setDescription(
            'Check proxy servers'
        )->setHelp(
            <<<EOT
The <info>proxyservers:proxies:check</info> check proxy list.

<info>php app/console proxyservers:proxies:check</info>
EOT
        );
    }

    /**
     * Fetches the entity manager from the container.
     *
     * @param \Symfony\Component\Console\Input\InputInterface   $input  Input
     * @param \Symfony\Component\Console\Output\OutputInterface $output Output
     *
     * @return void
     *
     * @throws \Exception
     */
    protected function initialize(InputInterface $input, OutputInterface $output)
    {
        parent::initialize($input, $output);

        $this->em = $this->getContainer()->get('doctrine.orm.entity_manager');

        if ($output->getVerbosity() !== OutputInterface::VERBOSITY_QUIET) {
            $output->writeln('<info>Start check:</info>');
        }
    }

    /**
     * {@inheritdoc}
     */
    protected function execute(InputInterface $input, OutputInterface $output)
    {
        $generalProxyRepository = $this->em->getRepository('ProxyBundle:GeneralProxy');

        $this->purgeDeletedProxies($generalProxyRepository);

        foreach ($generalProxyRepository->findBy(["active" => 1]) as $generalProxy) {
            /** @var \ProxyBundle\Entity\GeneralProxy $generalProxy */
            $data = $this->doRequest($generalProxy);
            $this->applyCheckResult($generalProxy, $data, new \DateTime());

            // Flush per proxy so that progress survives an interrupted run.
            $this->em->flush();
        }

        if ($output->getVerbosity() !== OutputInterface::VERBOSITY_QUIET) {
            $output->writeln('');
            $output->writeln("<info>Work complete successfully.</info>");
        }
    }

    /**
     * Removes the proxies with status DELETE whose check time is at least
     * DELETE_AFTER_DAY days old, or that have no check time at all.
     *
     * @param object $repository Repository of GeneralProxy entities
     */
    private function purgeDeletedProxies($repository)
    {
        $now = new \DateTime();

        foreach ($repository->findBy(["status" => GeneralProxy::STATUS_DELETE]) as $generalProxy) {
            /** @var \ProxyBundle\Entity\GeneralProxy $generalProxy */
            $checkTime = $generalProxy->getCheckTime();

            // DateInterval::$days is the total number of days between the two dates;
            // $d is only the day component and wraps around at every month boundary.
            if (null === $checkTime || $now->diff($checkTime)->days >= self::DELETE_AFTER_DAY) {
                $this->em->remove($generalProxy);
            }
        }

        $this->em->flush();
    }

    /**
     * Translates the outcome of a check request into the proxy's new state.
     *
     * @param GeneralProxy $generalProxy Proxy that was checked
     * @param array        $data         Result of doRequest()
     * @param \DateTime    $now          Time of the check
     */
    private function applyCheckResult(GeneralProxy $generalProxy, array $data, \DateTime $now)
    {
        $response = $data['response'];

        // The test script echoes the address it was called from: when that is the proxy's
        // own IP, the proxy works and does not reveal the caller.
        if (is_string($response) && trim($response) === trim((string) $generalProxy->getIp())) {
            $generalProxy->setStatus(GeneralProxy::STATUS_OK);
            // NOTE(review): total_time is a float number of seconds while the speed column
            // is an integer, so sub-second precision is presumably lost on save - confirm.
            $generalProxy->setSpeed($data['getinfo']['total_time']);
            $generalProxy->setCountTimeout(0);
            $generalProxy->setCheckTime($now);

            return;
        }

        if ($this->checkOldTimeout($data, $generalProxy->getCountTimeout())) {
            $this->markForDeletion($generalProxy, $now);

            return;
        }

        if ($this->checkTimeout($data)) {
            // Read the previous status BEFORE overwriting it; the check time is only
            // stamped when the proxy enters the TIMEOUT state, not on every retry.
            $wasTimeout = GeneralProxy::STATUS_TIMEOUT === $generalProxy->getStatus();

            $generalProxy->setStatus(GeneralProxy::STATUS_TIMEOUT);
            $generalProxy->setCountTimeout($generalProxy->getCountTimeout() + 1);

            if (!$wasTimeout) {
                $generalProxy->setCheckTime($now);
            }

            return;
        }

        $this->markForDeletion($generalProxy, $now);
    }

    /**
     * Gives the proxy the DELETE status. A proxy that has never been stamped
     * with a check time gets one now, so that it is kept for DELETE_AFTER_DAY
     * days instead of being removed by the very next run.
     *
     * @param GeneralProxy $generalProxy
     * @param \DateTime    $now
     */
    private function markForDeletion(GeneralProxy $generalProxy, \DateTime $now)
    {
        $generalProxy->setStatus(GeneralProxy::STATUS_DELETE);

        if (null === $generalProxy->getCheckTime()) {
            $generalProxy->setCheckTime($now);
        }
    }

    /**
     * Requests REMOTE_TEST_URL through the given proxy.
     *
     * @param GeneralProxy $proxy GeneralProxy
     *
     * @return array ['response' => string|false, 'getinfo' => array, 'errno' => int]
     */
    private function doRequest($proxy)
    {
        $ch = curl_init(self::REMOTE_TEST_URL);

        curl_setopt_array($ch, [
            CURLOPT_PROXY => $proxy->getIp(),
            CURLOPT_PROXYPORT => (int) $proxy->getPort(),
            CURLOPT_PROXYTYPE => $this->resolveProxyType($proxy->getType()),
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_TIMEOUT => self::CURLOPT_TIMEOUT,
            CURLOPT_USERAGENT => 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)'.
                ' Ubuntu Chromium/52.0.2743.116 Chrome/52.0.2743.116 Safari/537.36'
        ]);

        // With CURLOPT_RETURNTRANSFER the body is returned as a string, or false on failure.
        $result = curl_exec($ch);

        return ['response' => $result, 'getinfo' => curl_getinfo($ch), 'errno' => curl_errno($ch)];
    }

    /**
     * Converts a stored proxy type into the integer cURL expects.
     *
     * GeneralProxy stores the NAME of the cURL constant as a string. Passing
     * that string to CURLOPT_PROXYTYPE makes cURL read it as 0 (HTTP), so a
     * SOCKS proxy would be checked with the wrong protocol.
     *
     * @param string|null $type One of the GeneralProxy::TYPE_* constants
     *
     * @return int CURLPROXY_* value; HTTP when the type is missing or unknown
     */
    private function resolveProxyType($type)
    {
        $map = [
            GeneralProxy::TYPE_HTTP => CURLPROXY_HTTP,
            GeneralProxy::TYPE_SOCKS4 => CURLPROXY_SOCKS4,
            GeneralProxy::TYPE_SOCKS5 => CURLPROXY_SOCKS5,
        ];

        return (is_string($type) && isset($map[$type])) ? $map[$type] : CURLPROXY_HTTP;
    }

    /**
     * Tells whether the request timed out and the proxy has already used up
     * its allowed number of timeouts.
     *
     * @param array $data         Result of doRequest()
     * @param int   $countTimeout Timeouts counted so far
     *
     * @return bool
     */
    private function checkOldTimeout($data, $countTimeout)
    {
        return $this->checkTimeout($data) && ($countTimeout > self::MAX_COUNT_TIMEOUT);
    }

    /**
     * Tells whether the request failed because the operation timed out.
     *
     * @param array $data Result of doRequest()
     *
     * @return bool
     */
    private function checkTimeout($data)
    {
        return (false === $data['response']) && ($data['errno'] === CURLE_OPERATION_TIMEOUTED);
    }
}