2023-01-23 11:03:31 +01:00

501 lines
16 KiB
PHP

<?php
/**
* Piwik - free/libre analytics platform
*
* @link http://piwik.org
* @license http://www.gnu.org/licenses/gpl-3.0.html GPL v3 or later
*
*/
namespace Piwik\Plugins\Referrers;
use Piwik\Cache;
use Piwik\Common;
use Piwik\Option;
use Piwik\Piwik;
use Piwik\Singleton;
use Piwik\UrlHelper;
/**
* Contains methods to access search engine definition data.
*/
class SearchEngine extends Singleton
{
const OPTION_STORAGE_NAME = 'SearchEngineDefinitions';
/** @var string location of definition file (relative to PIWIK_INCLUDE_PATH) */
const DEFINITION_FILE = '/vendor/matomo/searchengine-and-social-list/SearchEngines.yml';
protected $definitionList = null;
/**
 * Returns the search engine definitions, indexed by URL.
 *
 * The list is served from the eager cache when present there; otherwise it
 * is loaded through loadDefinitions() and written to that cache first.
 *
 * @return array Array of ( URL => array( 'name' => ..., 'params' => ..., 'backlink' => ..., 'charsets' => ... ) )
 */
public function getDefinitions()
{
    $eagerCache = Cache::getEagerCache();
    $cacheKey   = 'SearchEngine-' . self::OPTION_STORAGE_NAME;

    if (!$eagerCache->contains($cacheKey)) {
        // Cache miss: build the list once and remember it
        $definitions = $this->loadDefinitions();
        $eagerCache->save($cacheKey, $definitions);

        return $definitions;
    }

    return $eagerCache->fetch($cacheKey);
}
/**
 * Loads the definition list, lets plugins extend it and normalises legacy entries.
 *
 * When no list is held on this instance yet, the list stored in the option
 * OPTION_STORAGE_NAME is used. If that option is missing, or does not decode
 * to a non-empty array, the bundled YAML file is parsed instead and the
 * result is written back to the option.
 *
 * On every call the event 'Referrer.addSearchEngineUrls' is posted with the
 * list passed by reference, and legacy definitions are converted afterwards.
 *
 * @return array
 */
private function loadDefinitions()
{
    if (empty($this->definitionList)) {
        $definitions = null;

        // Read first from the auto-updated list in database
        $stored = Option::get(self::OPTION_STORAGE_NAME);
        if ($stored) {
            $definitions = Common::safe_unserialize(base64_decode($stored));
        }

        if (!empty($definitions) && is_array($definitions)) {
            $this->definitionList = $definitions;
        } else {
            // Nothing usable is stored (option absent, or its content did not
            // decode to a non-empty array): fall back to the bundled list and
            // store it, so a damaged option value is replaced as well.
            $yml = file_get_contents(PIWIK_INCLUDE_PATH . self::DEFINITION_FILE);
            $this->definitionList = $this->loadYmlData($yml);
            Option::set(self::OPTION_STORAGE_NAME, base64_encode(serialize($this->definitionList)));
        }
    }

    // Plugins receive the list by reference and may add or change entries
    Piwik::postEvent('Referrer.addSearchEngineUrls', array(&$this->definitionList));

    $this->convertLegacyDefinitions();

    return $this->definitionList;
}
/**
 * Converts definitions given in the positional legacy format
 * ( name, params [, backlink [, charsets]] ) into the keyed format
 * ( 'name', 'params', 'backlink', 'charsets' ).
 *
 * Entries that already have a 'name' key, that are not arrays, or that lack
 * the first two positional values are left untouched. Missing optional
 * values (backlink, charsets) become null.
 *
 * @deprecated remove in 3.0
 */
protected function convertLegacyDefinitions()
{
    if (!is_array($this->definitionList)) {
        return;
    }

    foreach ($this->definitionList as $url => $definition) {
        // array_key_exists() requires an array; anything else cannot be a legacy entry
        if (!is_array($definition) || array_key_exists('name', $definition)) {
            continue;
        }

        if (!isset($definition[0]) || !isset($definition[1])) {
            continue;
        }

        $this->definitionList[$url] = array(
            'name'     => $definition[0],
            'params'   => $definition[1],
            'backlink' => isset($definition[2]) ? $definition[2] : null,
            'charsets' => isset($definition[3]) ? $definition[3] : null
        );
    }
}
/**
 * Parses the given YML string, converts the result into the URL-indexed
 * format and keeps it on this instance as the current definition list.
 *
 * @param string $yml
 * @return array the URL-indexed definitions
 */
public function loadYmlData($yml)
{
    $parsedYml = \Spyc::YAMLLoadString($yml);

    return $this->definitionList = $this->transformData($parsedYml);
}
/**
 * Flattens definitions grouped by search engine name into a map indexed by URL.
 *
 * Expected input: ( name => list of definitions ), where each definition is
 * an array holding a 'urls' list plus any further settings. Every URL of a
 * definition is mapped to a copy of that definition without the 'urls' key
 * and with 'name' set to the engine name. If a URL occurs more than once,
 * the last occurrence wins.
 *
 * Input that is not an array, engines without definitions and definitions
 * without a 'urls' list are skipped.
 *
 * @param array $searchEngines
 * @return array Array of ( URL => definition )
 */
protected function transformData($searchEngines)
{
    $urlToInfo = array();

    if (!is_array($searchEngines)) {
        return $urlToInfo;
    }

    foreach ($searchEngines as $name => $info) {
        if (empty($info) || !is_array($info)) {
            continue;
        }

        foreach ($info as $urlDefinitions) {
            // A definition without a list of URLs has nothing to be indexed by
            if (!is_array($urlDefinitions)
                || empty($urlDefinitions['urls'])
                || !is_array($urlDefinitions['urls'])
            ) {
                continue;
            }

            // The stored data is the same for every URL of this definition
            $searchEngineData = $urlDefinitions;
            unset($searchEngineData['urls']);
            $searchEngineData['name'] = $name;

            foreach ($urlDefinitions['urls'] as $url) {
                $urlToInfo[$url] = $searchEngineData;
            }
        }
    }

    return $urlToInfo;
}
/**
 * Returns the search engines indexed by name.
 *
 * Each name maps to the first URL found for it in the definition list.
 * The result is kept in the transient cache and reused while non-empty.
 *
 * @return array Array of ( searchEngineName => URL )
 */
public function getNames()
{
    $transientCache = Cache::getTransientCache();
    $cacheKey       = 'SearchEngine.getSearchEngineNames';

    $cachedNames = $transientCache->fetch($cacheKey);
    if (!empty($cachedNames)) {
        return $cachedNames;
    }

    $urlByName = array();
    foreach ($this->getDefinitions() as $url => $definition) {
        $engineName = $definition['name'];

        // keep the first URL seen for a name
        if (isset($urlByName[$engineName])) {
            continue;
        }

        $urlByName[$engineName] = $url;
    }

    $transientCache->save($cacheKey, $urlByName);

    return $urlByName;
}
/**
 * Looks up the definition stored for the given search engine host.
 *
 * @param string $host key of the definition list, e.g. a host or host + path
 * @return array the definition, or an empty array if the host is unknown
 */
public function getDefinitionByHost($host)
{
    $definitions = $this->getDefinitions();

    return array_key_exists($host, $definitions) ? $definitions[$host] : array();
}
/**
 * Extracts a keyword from a raw not encoded URL.
 * Will only extract a keyword if a known search engine has been detected.
 * Returns the keyword:
 * - in UTF8: converted using the charsets configured for the search engine, if any
 * - lowercased: "QUErY test!" will return "query test!"
 * - trimmed: extra spaces before and after are removed
 *
 * The function returns false when a keyword couldn't be found.
 * eg. if the url is "http://www.google.com/partners.html" this will return false,
 * as the google keyword parameter couldn't be found.
 *
 * The 'keywords' entry of the result is false in the special case
 * "search engine matched, but no keyword was provided".
 *
 * @param string $referrerUrl URL referrer URL, eg. $_SERVER['HTTP_REFERER']
 * @return array|bool false if a keyword couldn't be extracted,
 *                    or array(
 *                        'name' => 'Google',
 *                        'keywords' => 'my searched keywords')
 */
public function extractInformationFromUrl($referrerUrl)
{
    $referrerParsed = @parse_url($referrerUrl);

    $referrerHost = isset($referrerParsed['host']) ? $referrerParsed['host'] : '';
    if (empty($referrerHost)) {
        return false;
    }

    // some search engines (eg. Bing Images) use the same domain
    // as an existing search engine (eg. Bing), we must also use the url path
    $referrerPath = isset($referrerParsed['path']) ? $referrerParsed['path'] : '';

    $query = isset($referrerParsed['query']) ? $referrerParsed['query'] : '';

    // Google Referrers URLs sometimes have the fragment which contains the keyword
    if (!empty($referrerParsed['fragment'])) {
        $query .= '&' . $referrerParsed['fragment'];
    }

    $referrerHost = $this->getEngineHostFromUrl($referrerHost, $referrerPath, $query);
    if (empty($referrerHost)) {
        return false;
    }

    $definitions = $this->getDefinitionByHost($referrerHost);

    // The resolved host may have no definition (getDefinitionByHost() then
    // returns an empty array); without a name there is nothing to report.
    if (empty($definitions) || !isset($definitions['name'])) {
        return false;
    }

    $searchEngineName = $definitions['name'];
    // Cast so that a single parameter name given as a scalar is iterated too
    $variableNames = isset($definitions['params']) ? (array) $definitions['params'] : array();
    $keywordsHiddenFor = !empty($definitions['hiddenkeyword']) ? $definitions['hiddenkeyword'] : array();

    $key = null;
    if ($searchEngineName === 'Google Images') {
        if (strpos($query, '&prev') !== false) {
            // The real query is carried, url encoded, inside the "prev" parameter
            $prev = UrlHelper::getParameterFromQueryString($query, 'prev');
            // NOTE(review): assumes the helper yields null when the parameter is absent - confirm
            $query = urldecode(trim($prev === null ? '' : $prev));
            $query = str_replace('&', '&amp;', strstr($query, '?'));
        }
    } elseif ($searchEngineName === 'Google'
        && (strpos($query, '&as_') !== false || strpos($query, 'as_') === 0)
    ) {
        // Google advanced search: combine the as_* parameters into one keyword
        $keys = array();

        $key = UrlHelper::getParameterFromQueryString($query, 'as_q');
        if (!empty($key)) {
            array_push($keys, $key);
        }

        $key = UrlHelper::getParameterFromQueryString($query, 'as_oq');
        if (!empty($key)) {
            array_push($keys, str_replace('+', ' OR ', $key));
        }

        $key = UrlHelper::getParameterFromQueryString($query, 'as_epq');
        if (!empty($key)) {
            array_push($keys, "\"$key\"");
        }

        $key = UrlHelper::getParameterFromQueryString($query, 'as_eq');
        if (!empty($key)) {
            array_push($keys, "-$key");
        }

        $key = trim(urldecode(implode(' ', $keys)));
    }

    if ($searchEngineName === 'Google') {
        // top bar menu
        $tbm = UrlHelper::getParameterFromQueryString($query, 'tbm');
        switch ($tbm) {
            case 'isch':
                $searchEngineName = 'Google Images';
                break;
            case 'vid':
                $searchEngineName = 'Google Video';
                break;
            case 'shop':
                $searchEngineName = 'Google Shopping';
                break;
        }
    }

    if (empty($key)) {
        foreach ($variableNames as $variableName) {
            if (strpos($variableName, '/') === 0) {
                // regular expression match against the whole referrer URL
                if (preg_match($variableName, $referrerUrl, $matches)) {
                    $key = trim(urldecode($matches[1]));
                    break;
                }
            } else {
                // search for keywords now &vname=keyword
                $key = UrlHelper::getParameterFromQueryString($query, $variableName);
                // urldecode() must not be handed null
                $key = trim(urldecode($key === null ? '' : $key));

                // Special cases: empty keywords
                if (empty($key)
                    && (
                        // empty keyword parameter
                        strpos($query, sprintf('&%s=', $variableName)) !== false
                        || strpos($query, sprintf('?%s=', $variableName)) !== false
                    )
                ) {
                    $key = false;
                }

                if (!empty($key)
                    || $key === false
                ) {
                    break;
                }
            }
        }
    }

    // if no keyword found, but empty keywords are allowed
    if (!empty($keywordsHiddenFor) && ($key === null || $key === '')) {
        $pathWithQueryAndFragment = $referrerPath;
        if (!empty($query)) {
            $pathWithQueryAndFragment .= '?' . $query;
        }
        if (!empty($referrerParsed['fragment'])) {
            $pathWithQueryAndFragment .= '#' . $referrerParsed['fragment'];
        }

        foreach ($keywordsHiddenFor as $path) {
            // entries wrapped in slashes are regular expressions, others are literal paths
            if (strlen($path) > 1 && substr($path, 0, 1) == '/' && substr($path, -1, 1) == '/') {
                if (preg_match($path, $pathWithQueryAndFragment)) {
                    $key = false;
                    break;
                }
            } elseif ($path == $pathWithQueryAndFragment) {
                $key = false;
                break;
            }
        }
    }

    // $key === false is the special case "No keyword provided" which is a Search engine match
    if ($key === null || $key === '') {
        return false;
    }

    if (!empty($key)) {
        if (!empty($definitions['charsets'])) {
            $key = $this->convertCharset($key, $definitions['charsets']);
        }

        $key = Common::mb_strtolower($key);
    }

    return array(
        'name'     => $searchEngineName,
        'keywords' => $key,
    );
}
/**
 * Resolves the key of the definition list that best matches the given URL parts.
 *
 * Candidates are tried in this order, the first one present in the
 * definition list wins:
 * 1. host + path
 * 2. host
 * 3. host pattern + path
 * 4. host pattern
 * If none is defined, a few special cases are mapped to fixed hosts.
 *
 * @param string $host
 * @param string $path
 * @param string $query
 * @return string|bool the matching definition key, or false if nothing matches
 */
protected function getEngineHostFromUrl($host, $path, $query)
{
    $definitions = $this->getDefinitions();
    $lossyHost   = UrlHelper::getLossyUrl($host);

    $candidates = array(
        $host . $path,
        $host,
        $lossyHost . $path,
        $lossyHost,
    );

    foreach ($candidates as $candidate) {
        if (array_key_exists($candidate, $definitions)) {
            return $candidate;
        }
    }

    // special handling for hosts that are not listed themselves

    if (strncmp($query, 'cx=partner-pub-', 15) === 0) {
        // Google custom search engine
        return 'google.com/cse';
    }

    if (strncmp($path, '/pemonitorhosted/ws/results/', 28) === 0) {
        // private-label search powered by InfoSpace Metasearch
        return 'wsdsold.infospace.com';
    }

    // Both Yahoo checks only match when the suffix is found after position 0,
    // i.e. when something precedes it in the host
    if (strpos($host, '.images.search.yahoo.com') > 0) {
        // Yahoo! Images
        return 'images.search.yahoo.com';
    }

    if (strpos($host, '.search.yahoo.com') > 0) {
        // Yahoo!
        return 'search.yahoo.com';
    }

    return false;
}
/**
 * Tries to convert the given string from one of the given charsets to UTF-8.
 *
 * The first charset is used as source unless several are given and
 * mb_detect_encoding() picks one of them. The input is returned unchanged
 * when iconv is unavailable, no charset is given, or the conversion yields
 * an empty result.
 *
 * @param string $string
 * @param array $charsets
 * @return string
 */
protected function convertCharset($string, $charsets)
{
    if (!function_exists('iconv') || empty($charsets)) {
        return $string;
    }

    $sourceCharset = $charsets[0];

    if (count($charsets) > 1 && function_exists('mb_detect_encoding')) {
        $detected = mb_detect_encoding($string, $charsets);
        if ($detected !== false) {
            $sourceCharset = $detected;
        }
    }

    $converted = @iconv($sourceCharset, 'UTF-8//IGNORE', $string);

    return empty($converted) ? $string : $converted;
}
/**
 * Returns the URL of the search engine with the given name.
 *
 * The URL is built by prefixing the host registered for that name with
 * 'http://'.
 *
 * @param string $name
 * @return string URL, or the text 'URL unknown!' if the name is not known
 */
public function getUrlFromName($name)
{
    $urlByName = $this->getNames();

    if (!isset($urlByName[$name])) {
        return 'URL unknown!';
    }

    return 'http://' . $urlByName[$name];
}
/**
 * Returns the host part of a search engine URL.
 *
 * Everything up to and including the first '//' is dropped (also when the
 * URL starts with '//'), then everything from the first remaining '/' on.
 *
 * @param string $url
 * @return string host
 */
private function getHostFromUrl($url)
{
    // Compare against false explicitly: a match at position 0 is a match too
    $schemeSeparator = strpos($url, '//');
    if ($schemeSeparator !== false) {
        $url = substr($url, $schemeSeparator + 2);
    }

    $pathStart = strpos($url, '/');
    if ($pathStart !== false) {
        $url = substr($url, 0, $pathStart);
    }

    return $url;
}
/**
 * Returns the path of the search engine logo for the given URL.
 *
 * The logo file is named after the host of the URL. If no such file exists
 * below PIWIK_INCLUDE_PATH, the path of the generic 'xx' logo is returned.
 *
 * @param string $url
 * @return string path, relative to PIWIK_INCLUDE_PATH
 */
public function getLogoFromUrl($url)
{
    $logoPattern = 'plugins/Morpheus/icons/dist/searchEngines/%s.png';

    $logoPath = sprintf($logoPattern, $this->getHostFromUrl($url));
    if (!file_exists(PIWIK_INCLUDE_PATH . '/' . $logoPath)) {
        $logoPath = sprintf($logoPattern, 'xx');
    }

    return $logoPath;
}
/**
 * Returns the search engine URL that searches for the given keyword.
 *
 * The host used to look up the definition is the part of $url after '//',
 * or $url itself when it contains no '//'. The '{k}' placeholder of the
 * definition's backlink is replaced with the encoded keyword.
 *
 * @param string $url Search engine URL or domain name, e.g., http://search.piwik.org or search.piwik.org
 * @param string $keyword Keyword, e.g., web+analytics
 * @return string|bool URL, e.g., http://search.piwik.org/q=web+analytics,
 *                     or false if no backlink is defined for the host
 */
public function getBackLinkFromUrlAndKeyword($url, $keyword)
{
    if ($keyword === API::LABEL_KEYWORD_NOT_DEFINED) {
        return 'https://matomo.org/faq/general/#faq_144';
    }

    $keyword = urlencode($keyword);
    // urlencode('+') is '%2B' and urlencode(' ') is '+': encoded plus signs become '+' again
    $keyword = str_replace(urlencode('+'), urlencode(' '), $keyword);

    // Only strip a scheme when there is one; strpos() returns false otherwise
    // and "false + 2" would cut the first two characters off the host
    $schemeSeparator = strpos($url, '//');
    $host = $schemeSeparator === false ? $url : substr($url, $schemeSeparator + 2);

    $definition = $this->getDefinitionByHost($host);
    if (empty($definition['backlink'])) {
        return false;
    }

    $path = str_replace('{k}', $keyword, $definition['backlink']);

    return $url . (substr($url, -1) != '/' ? '/' : '') . $path;
}
}