File indexing completed on 2024-12-22 05:36:24

0001 <?php
0002 /**
0003  *  ocs-webserver
0004  *
0005  *  Copyright 2016 by pling GmbH.
0006  *
0007  *    This file is part of ocs-webserver.
0008  *
0009  *    This program is free software: you can redistribute it and/or modify
0010  *    it under the terms of the GNU Affero General Public License as
0011  *    published by the Free Software Foundation, either version 3 of the
0012  *    License, or (at your option) any later version.
0013  *
0014  *    This program is distributed in the hope that it will be useful,
0015  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
0016  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
0017  *    GNU Affero General Public License for more details.
0018  *
0019  *    You should have received a copy of the GNU Affero General Public License
0020  *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
0021  **/
0022 
/**
 * Crawler detection.
 *
 * Reports whether a User-Agent header value contains one of the known
 * crawler / bot / automated-client signatures listed below. Matching is a
 * case-insensitive substring search, so a signature may appear anywhere in
 * the header value.
 *
 * @param string $USER_AGENT raw User-Agent header value
 * @return bool true when a known signature is found; false otherwise,
 *              including when the value is empty
 */
function crawlerDetect($USER_AGENT)
{
    // If the user agent is empty, we assume that it is not a bot.
    if (empty($USER_AGENT)) {
        return false;
    }

    // Map of signature => label or sample user agent.
    // Only the signature (the array key) is ever matched; the value is kept
    // purely as a reference for maintainers.
    // NOTE(review): short signatures such as 'MSN', 'Yahoo', 'Buck' or 'curl'
    // match anywhere in the header, whatever the letter case - confirm that
    // the resulting false positives are acceptable.
    $crawlers = array(
        'Googlebot' => 'Googlebot',
        'MSN' => 'MSN',
        'msnbot-media' => 'MSN',
        'bingbot' => 'MSN',
        'MegaIndex.ru' => 'MegaIndex.ru',
        'Baiduspider' => 'Baiduspider',
        'YandexBot' => 'YandexBot',
        'AhrefsBot' => 'Mozilla/5.0 (compatible; AhrefsBot/6.1; +http://ahrefs.com/robot/)',
        'ltx71' => 'ltx71',
        'msnbot' => 'MSN',
        'Rambler' => 'Rambler',
        'Yahoo' => 'Yahoo',
        'AbachoBOT' => 'AbachoBOT',
        'accoona' => 'Accoona',
        'AcoiRobot' => 'AcoiRobot',
        'ASPSeek' => 'ASPSeek',
        'CrocCrawler' => 'CrocCrawler',
        'Dumbot' => 'Dumbot',
        'FAST-WebCrawler' => 'FAST-WebCrawler',
        'GeonaBot' => 'GeonaBot',
        'Gigabot' => 'Gigabot',
        'Lycos' => 'Lycos spider',
        'MSRBOT' => 'MSRBOT',
        'Scooter' => 'Altavista robot',
        'AltaVista' => 'Altavista robot',
        'IDBot' => 'ID-Search Bot',
        'eStyle' => 'eStyle Bot',
        'Scrubby' => 'Scrubby robot',
        'MJ12bot' => 'Mozilla/5.0 (compatible; MJ12bot/v1.4.8; http://mj12bot.com/)',
        'SemrushBot' => 'SemrushBot',
        'DotBot' => 'http://www.opensiteexplorer.org/dotbot',
        'SEOkicks' => 'https://www.seokicks.de/robot.html',
        'CCBot' => 'CCBot/2.0 (https://commoncrawl.org/faq/)',
        'Sogou' => 'Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)',
        'Bytespider' => 'Bytespider;https://zhanzhang.toutiao.com/',
        'BLEXBot' => 'BLEXBot/1.0; +http://webmeup-crawler.com/',
        'Applebot' => 'Applebot/0.1; +http://www.apple.com/go/applebot',
        'serpstatbot' => 'serpstatbot/1.0 (advanced backlink tracking bot; curl/7.58.0; http://serpstatbot.com/; abuse@serpstatbot.com)',
        'Linespider' => 'Linespider/1.1;+https://lin.ee/4dwXkTH',
        'Yeti' => 'Yeti/1.1; +http://naver.me/spd',
        'Feedspot' => 'Feedspot/1.0 (+https://www.feedspot.com/fs/fetcher; like FeedFetcher-Google)',
        'fantastic_search_engine_crawler' => 'fantastic_search_engine_crawler/2.0 (Linux) fantastic-crawler@umich.edu',
        'Qwantify' => 'Qwantify/Bleriot/1.1; +https://help.qwant.com/bot',
        'coccocbot' => 'coccocbot-web/1.0; +http://help.coccoc.com/searchengine',
        'nagios-plugins' => 'check_http/v2.2.1 (nagios-plugins 2.2.1)',
        'urlwatch' => 'urlwatch/2.17 (+https://thp.io/2008/urlwatch/info.html)',
        'Buck' => 'Buck/2.2; (+https://app.hypefactors.com/media-monitoring/about.html)',
        'Anitya' => 'Anitya 0.17.2 at release-monitoring.org',
        'MauiBot' => 'MauiBot (crawler.feedback+dc@gmail.com)',
        'istellabot' => 'istellabot/t.1.13',
        'SeznamBot' => 'Mozilla/5.0 (compatible; SeznamBot/3.2-test1; +http://napoveda.seznam.cz/en/seznambot-intro/)',
        'TelegramBot' => 'TelegramBot (like TwitterBot)',
        'Synapse' => 'Synapse/1.0.0',
        'VelenPublicWebCrawler' => 'Mozilla/5.0 (compatible; VelenPublicWebCrawler/1.0; +https://velen.io)',
        'MagiBot' => 'Mozilla/5.0 (compatible; MagiBot/1.0.0; Matarael; +https://magi.com/bots)',
        'linkfluence' => 'Mozilla/5.0 (compatible; YaK/1.0; http://linkfluence.com/; bot@linkfluence.com)',
        'repology' => 'repology-linkchecker/1 (+https://repology.org/bots)',
        'yacybot' => 'Mozilla/5.0 (compatible; yacybot/1.921/custom +https://searx.everdot.org/about)',
        'facebookexternalhit' => 'facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)',
        'ZoominfoBot' => 'ZoominfoBot (zoominfobot at zoominfo dot com)',
        'curl' => 'curl/7.66.0',
        'ZoomBot' => 'ZoomBot (Linkbot 1.0 http://suite.seozoom.it/bot.html)',
        'PaperLiBot' => 'Mozilla/5.0 (compatible; PaperLiBot/2.1; https://support.paper.li/entries/20023257-what-is-paper-li)',
        'python-requests' => 'python-requests/2.22.0',
        'Cliqzbot' => 'Mozilla/5.0 (compatible; Cliqzbot/3.0; +http://cliqz.com/company/cliqzbot)',
        'YisouSpider' => 'YisouSpider',
        'trendictionbot' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; trendictionbot0.5.0; trendiction search; http://www.trendiction.de/bot; please let us know of any problems; web at trendiction.com) Gecko/20170101 Firefox/67.0',
        'Jetslide' => 'Mozilla/5.0 (compatible; Jetslide; +http://jetsli.de/crawler)',
        'Seekport' => 'Mozilla/5.0 (compatible; Seekport Crawler; http://seekport.com/)',
        'GarlikCrawler' => 'GarlikCrawler/1.2 (http://garlik.com/, crawler@garlik.com)',
        'Mb2345Browser' => 'Mozilla/5.0(Linux;Android 5.1.1;OPPO A33 Build/LMY47V;wv) AppleWebKit/537.36(KHTML,link Gecko) Version/4.0 Chrome/42.0.2311.138 Mobile Safari/537.36 Mb2345Browser/9.0',
        'LieBaoFast' => 'Mozilla/5.0(Linux;Android 5.1.1;OPPO A33 Build/LMY47V;wv) AppleWebKit/537.36(KHTML,link Gecko) Version/4.0 Chrome/43.0.2357.121 Mobile Safari/537.36 LieBaoFast/4.51.3',
        'TBS/043602' => 'Mozilla/5.0 (Linux; Android 7.0; FRD-AL00 Build/HUAWEIFRD-AL00; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043602 Safari/537.36 MicroMessenger/6.5.16.1120 NetType/WIFI Language/zh_CN',
        'zh-CN;OPPO A33 Build/LMY47V' => 'Mozilla/5.0(Linux;U;Android 5.1.1;zh-CN;OPPO A33 Build/LMY47V) AppleWebKit/537.36(KHTML,like Gecko) Version/4.0 Chrome/40.0.2214.89 UCBrowser/11.7.0.953 Mobile Safari/537.36'
    );

    foreach (array_keys($crawlers) as $signature) {
        // PHP stores purely numeric string keys as integers, so cast back to
        // string in case such a signature is ever added to the table.
        // stripos() returns the match offset (possibly 0) or false, hence the
        // strict comparison.
        if (stripos($USER_AGENT, (string) $signature) !== false) {
            return true;
        }
    }

    return false;
}