diff --git a/classes/cliTool/traits/ConvertLogFile.php b/classes/cliTool/traits/ConvertLogFile.php index 747928227af..3c80e61a514 100644 --- a/classes/cliTool/traits/ConvertLogFile.php +++ b/classes/cliTool/traits/ConvertLogFile.php @@ -26,9 +26,11 @@ use APP\statistics\StatisticsHelper; use DateTime; use Exception; +use PKP\config\Config; use PKP\core\Core; use PKP\core\Registry; use PKP\db\DAORegistry; +use PKP\facades\Locale; use PKP\file\FileManager; use PKP\submission\Genre; @@ -169,7 +171,6 @@ public function convert(string $fileName): void } $newEntry['userAgent'] = $entryData['userAgent']; - $newEntry['canonicalUrl'] = $entryData['url']; [ 'workingAssocType' => $assocType, @@ -188,13 +189,23 @@ public function convert(string $fileName): void $context = $this->contextsByPath[$foundContextPath]; $newEntry['contextId'] = $context->getId(); - $this->setAssoc($assocType, $op, $args, $newEntry); + // temporarily set the canonicalUrlPage that is needed in the child class + $newEntry['canonicalUrlPage'] = $page; + $this->setAssoc($assocType, $args, $newEntry); if (!array_key_exists('assocType', $newEntry)) { if (!$this->isApacheAccessLogFile()) { fwrite(STDERR, "The URL {$entryData['url']} in the line number {$lineNumber} was not considered." . PHP_EOL); } continue; } + + $canonicalUrl = $entryData['url']; // if this is not the apache log file i.e. it is the internal log file, the URLs are already canonical + if ($this->isApacheAccessLogFile()) { + $canonicalUrl = $this->getCanonicalUrl($foundContextPath, $newEntry['canonicalUrlPage'], $newEntry['canonicalUrlOp'], $newEntry['canonicalUrlArgs'] ?? 
null); + } + $newEntry['canonicalUrl'] = $canonicalUrl; + // unset elements that are temporarily used and should not be logged + unset($newEntry['canonicalUrlPage'], $newEntry['canonicalUrlOp'], $newEntry['canonicalUrlArgs']); } else { continue; } @@ -408,12 +419,14 @@ protected function getExpectedPageAndOp(): array break; case 'omp': // Before 3.4 OMP did not have chapter assoc type i.e. chapter landing page - // so no need to consider it here + // consider it here however, in order to allow current apache access log file conversion $pageAndOp = $pageAndOp + [ Application::ASSOC_TYPE_SUBMISSION_FILE => [ 'catalog/download'], Application::ASSOC_TYPE_MONOGRAPH => [ 'catalog/book'], + Application::ASSOC_TYPE_CHAPTER => [ + 'catalog/book'], Application::ASSOC_TYPE_SERIES => [ 'catalog/series'] ]; @@ -478,8 +491,8 @@ protected static function getContextPaths(string $urlInfo, bool $isPathInfo): ar */ protected static function getPage(string $urlInfo, bool $isPathInfo): string { - $page = self::getUrlComponents($urlInfo, $isPathInfo, 0, 'page'); - return Core::cleanFileVar(is_null($page) ? '' : $page); + $page = self::getUrlComponents($urlInfo, $isPathInfo, self::getOffset($urlInfo, $isPathInfo, 0), 'page'); + return Core::cleanFileVar($page ?? ''); } /** @@ -489,8 +502,8 @@ protected static function getPage(string $urlInfo, bool $isPathInfo): string */ protected static function getOp(string $urlInfo, bool $isPathInfo): string { - $operation = self::getUrlComponents($urlInfo, $isPathInfo, 1, 'op'); - return Core::cleanFileVar(empty($operation) ? 
'index' : $operation); + $operation = self::getUrlComponents($urlInfo, $isPathInfo, self::getOffset($urlInfo, $isPathInfo, 1), 'op'); + return Core::cleanFileVar($operation ?: 'index'); } /** @@ -501,14 +514,32 @@ protected static function getOp(string $urlInfo, bool $isPathInfo): string */ protected static function getArgs(string $urlInfo, bool $isPathInfo): array { - return self::getUrlComponents($urlInfo, $isPathInfo, 2, 'path'); + return self::getUrlComponents($urlInfo, $isPathInfo, self::getOffset($urlInfo, $isPathInfo, 2), 'path'); + } + + /** + * Get offset. Add 1 extra if localization present in URL + */ + private static function getOffset(string $urlInfo, bool $isPathInfo, int $varOffset): int + { + return $varOffset + (int) !!self::getLocalization($urlInfo, $isPathInfo); + } + + /** + * Get localization path present into the passed + * url information. + */ + public static function getLocalization(string $urlInfo, bool $isPathInfo): string + { + $locale = self::getUrlComponents($urlInfo, $isPathInfo, 0); + return Locale::isLocaleValid($locale) ? $locale : ''; } /** * Get url components (page, operation and args) * based on the passed offset. 
*/ - protected static function getUrlComponents(string $urlInfo, bool $isPathInfo, int $offset, string $varName = ''): mixed + protected static function getUrlComponents(string $urlInfo, bool $isPathInfo, int $offset, string $varName = ''): array|string|null { $component = null; @@ -517,7 +548,6 @@ protected static function getUrlComponents(string $urlInfo, bool $isPathInfo, in $isArrayComponent = true; } if ($isPathInfo) { - $application = Application::get(); $contextDepth = 1; // Was $application->getContextDepth(); $vars = explode('/', trim($urlInfo, '/')); @@ -544,10 +574,44 @@ protected static function getUrlComponents(string $urlInfo, bool $isPathInfo, in return $component; } + /** + * Construct the canonical URL from context path, page, op, and args, replacing an aliased request base URL by the configured base URL + */ + protected function getCanonicalUrl(string $contextPath, string $canonicalUrlPage, string $canonicalUrlOp, ?array $canonicalUrlArgs = null): string + { + $canonicalUrl = Application::get()->getDispatcher()->url( + Application::get()->getRequest(), + Application::ROUTE_PAGE, + $contextPath, + $canonicalUrlPage, + $canonicalUrlOp, + $canonicalUrlArgs, + urlLocaleForPage: '' + ); + + // Make sure we log the server name and not aliases. + $configBaseUrl = Config::getVar('general', 'base_url'); + $requestBaseUrl = Application::get()->getRequest()->getBaseUrl(); + if ($requestBaseUrl !== $configBaseUrl) { + // Make sure it's not an url override (no alias on that case). + if (!in_array($requestBaseUrl, Config::getContextBaseUrls(), true) && + $requestBaseUrl !== Config::getVar('general', 'base_url[index]')) { + // Alias found, replace it by base_url from config file. + // Make sure we use the correct base url override value for the context, if any. + $baseUrlReplacement = Config::getVar('general', 'base_url[' . $contextPath . 
']'); + if (!$baseUrlReplacement) { + $baseUrlReplacement = $configBaseUrl; + } + $canonicalUrl = str_replace($requestBaseUrl, $baseUrlReplacement, $canonicalUrl); + } + } + return $canonicalUrl; + } + /** * Set assoc type and IDs from the passed page, operation and arguments. */ - protected function setAssoc(int $assocType, string $op, array $args, array &$newEntry): void + protected function setAssoc(int $assocType, array $args, array &$newEntry): void { $application = Application::get(); $applicationName = $application->getName(); diff --git a/tools/convertApacheAccessLogFile.php b/tools/convertApacheAccessLogFile.php index e5eadf46f3c..8a08f8d3929 100644 --- a/tools/convertApacheAccessLogFile.php +++ b/tools/convertApacheAccessLogFile.php @@ -120,7 +120,7 @@ public function __construct(array $argv = []) } // This tool needs egrep path configured. - if (file_exists(self::EGREP_PATH)) { - fwrite(STDERR, 'Error: This tool needs egrep program. Please define the constatn EGREP_PATH in this script, enter there the path to egrep command on your machine.' . PHP_EOL); + if (!file_exists(self::EGREP_PATH)) { + fwrite(STDERR, 'Error: This tool needs egrep program. Please define the constant EGREP_PATH in this script, enter there the path to egrep command on your machine.' . PHP_EOL); exit(9); } @@ -156,7 +156,7 @@ public function isApacheAccessLogFile(): bool */ public function usage() { - echo "\nConvert the passed apache access log file into the new usage stats log file format. + echo "\nConvert the apache access log file into the new usage stats log file format. 
This will copy the apache access file to the usageStats/tmp/ folder in the files directory, filter entries related to this installation, split the file by day, rename the result file(s) into apache_usage_events_YYYYMMDD.log, convert them into the new JSON format, and @@ -304,6 +304,8 @@ public function splitFileByDay(string $filePath): array } // Get all days between the first and the last date, including the last date + $firstDate->setTime(0, 0, 0); + $lastDate->setTime(0, 0, 1); $period = new DatePeriod( $firstDate, new DateInterval('P1D'), @@ -408,38 +410,25 @@ protected function getExpectedPageAndOp(): array return $pageAndOp; } - /** - * Set assoc type and IDs from the passed page, operation and arguments. - */ - protected function setAssoc(int $assocType, string $op, array $args, array &$newEntry): void - { - $application = Application::get(); - $applicationName = $application->getName(); - switch ($applicationName) { - case 'ojs2': - $this->setOJSAssoc($assocType, $args, $newEntry); - break; - case 'omp': - $this->setOMPAssoc($assocType, $args, $newEntry); - break; - case 'ops': - $this->setOPSAssoc($assocType, $args, $newEntry); - break; - default: - throw new Exception('Unrecognized application name!'); - } - } - /** * Set assoc type and IDs from the passed page, operation and * arguments specific to OJS. 
*/ protected function setOJSAssoc(int $assocType, array $args, array &$newEntry): void { + $newEntry['submissionId'] = null; + $newEntry['representationId'] = null; + $newEntry['submissionFileId'] = null; + $newEntry['fileType'] = null; + $newEntry['issueId'] = null; + $newEntry['issueGalleyId'] = null; + switch ($assocType) { case Application::getContextAssocType(): // $newEntry['contextId'] has already been set $newEntry['assocType'] = $assocType; + $newEntry['canonicalUrlPage'] = Application::SITE_CONTEXT_PATH; + $newEntry['canonicalUrlOp'] = ''; break; case Application::ASSOC_TYPE_SUBMISSION: @@ -454,6 +443,7 @@ protected function setOJSAssoc(int $assocType, array $args, array &$newEntry): v break; } $submissionId = $submission->getId(); + $newEntry['canonicalUrlArgs'] = [$submissionId]; // If it is an older submission version, the arguments must be: // $submissionId/version/$publicationId. @@ -470,6 +460,7 @@ protected function setOJSAssoc(int $assocType, array $args, array &$newEntry): v fwrite(STDERR, "Publication (submission version) with the ID {$publicationId} does not exist in the submission with the ID {$submissionId}." . PHP_EOL); break; } + array_push($newEntry['canonicalUrlArgs'], 'version', $publicationId); } elseif (count($args) == 2) { // Consider usage stats log files from releases 2.x: // The URL article/view/{$articleId}/{$galleyId} was used for assoc type galley. 
@@ -523,10 +514,16 @@ protected function setOJSAssoc(int $assocType, array $args, array &$newEntry): v $newEntry['representationId'] = $representationId; $newEntry['submissionFileId'] = $submissionFileId; $newEntry['fileType'] = $fileType; + + $newEntry['canonicalUrlPage'] = 'article'; + $newEntry['canonicalUrlOp'] = 'download'; + array_push($newEntry['canonicalUrlArgs'], $representationId, $submissionFileId); break; } $newEntry['submissionId'] = $submissionId; $newEntry['assocType'] = $assocType; + $newEntry['canonicalUrlPage'] = 'article'; + $newEntry['canonicalUrlOp'] = 'view'; break; case Application::ASSOC_TYPE_SUBMISSION_FILE: @@ -545,6 +542,7 @@ protected function setOJSAssoc(int $assocType, array $args, array &$newEntry): v break; } $submissionId = $submission->getId(); + $newEntry['canonicalUrlArgs'] = [$submissionId]; // If it is an older submission version, the arguments must be: // $submissionId/version/$publicationId/$representationId/$submissionFileId. @@ -564,6 +562,7 @@ protected function setOJSAssoc(int $assocType, array $args, array &$newEntry): v if (isset($args[4])) { $submissionFileId = (int) $args[4]; } + array_push($newEntry['canonicalUrlArgs'], 'version', $publicationId); } else { $representationUrlPath = $args[1]; if (isset($args[2])) { @@ -643,7 +642,7 @@ protected function setOJSAssoc(int $assocType, array $args, array &$newEntry): v } // is this a full text or supp file - $genreDao = DAORegistry::getDAO('GenreDAO'); + $genreDao = DAORegistry::getDAO('GenreDAO'); /** @var GenreDAO $genreDao */ $genre = $genreDao->getById($submissionFile->getData('genreId')); if ($genre->getCategory() != Genre::GENRE_CATEGORY_DOCUMENT || $genre->getSupplementary() || $genre->getDependent()) { $newEntry['assocType'] = Application::ASSOC_TYPE_SUBMISSION_FILE_COUNTER_OTHER; @@ -654,6 +653,10 @@ protected function setOJSAssoc(int $assocType, array $args, array &$newEntry): v $newEntry['representationId'] = $representationId; $newEntry['submissionFileId'] = 
$submissionFileId; $newEntry['fileType'] = StatisticsHelper::getDocumentType($submissionFile->getData('mimetype')); + + $newEntry['canonicalUrlPage'] = 'article'; + $newEntry['canonicalUrlOp'] = 'download'; + array_push($newEntry['canonicalUrlArgs'], $representationId, $submissionFileId); break; case Application::ASSOC_TYPE_SUBMISSION_FILE_COUNTER_OTHER: @@ -673,6 +676,7 @@ protected function setOJSAssoc(int $assocType, array $args, array &$newEntry): v break; } $submissionId = $submission->getId(); + $newEntry['canonicalUrlArgs'] = [$submissionId]; $galley = $submissionFile = null; $publications = $submission->getData('publications'); @@ -707,6 +711,10 @@ protected function setOJSAssoc(int $assocType, array $args, array &$newEntry): v $newEntry['representationId'] = $galley->getId(); $newEntry['submissionFileId'] = $submissionFile->getId(); $newEntry['fileType'] = StatisticsHelper::getDocumentType($submissionFile->getData('mimetype')); + + $newEntry['canonicalUrlPage'] = 'article'; + $newEntry['canonicalUrlOp'] = 'download'; + $newEntry['canonicalUrlArgs'] = [$submissionId, $galley->getId(), $submissionFile->getId()]; } else { fwrite(STDERR, 'Supp file could not be found.' . 
PHP_EOL); } @@ -732,6 +740,10 @@ protected function setOJSAssoc(int $assocType, array $args, array &$newEntry): v $issueId = $issue->getId(); $newEntry['issueId'] = $issueId; $newEntry['assocType'] = $assocType; + + $newEntry['canonicalUrlPage'] = 'issue'; + $newEntry['canonicalUrlOp'] = 'view'; + $newEntry['canonicalUrlArgs'] = [$issue->getId()]; break; case Application::ASSOC_TYPE_ISSUE_GALLEY: @@ -750,7 +762,7 @@ protected function setOJSAssoc(int $assocType, array $args, array &$newEntry): v break; } $issueId = $issue->getId(); - $issueGalleyDao = DAORegistry::getDAO('IssueGalleyDAO'); + $issueGalleyDao = DAORegistry::getDAO('IssueGalleyDAO'); /** @var IssueGalleyDAO $issueGalleyDao */ $issueGalley = $issueGalleyDao->getByBestId($args[1], $issueId); if (!$issueGalley) { fwrite(STDERR, "Issue galley with the URL path or ID {$args[1]} does not exist in the issue with the ID {$issueId}." . PHP_EOL); @@ -759,6 +771,9 @@ protected function setOJSAssoc(int $assocType, array $args, array &$newEntry): v $newEntry['issueId'] = $issueId; $newEntry['issueGalleyId'] = $issueGalley->getId(); $newEntry['assocType'] = $assocType; + $newEntry['canonicalUrlPage'] = 'issue'; + $newEntry['canonicalUrlOp'] = 'download'; + $newEntry['canonicalUrlArgs'] = [$issue->getId(), $issueGalley->getId()]; break; } } @@ -769,10 +784,18 @@ protected function setOJSAssoc(int $assocType, array $args, array &$newEntry): v */ protected function setOMPAssoc(int $assocType, array $args, array &$newEntry): void { + $newEntry['submissionId'] = null; + $newEntry['representationId'] = null; + $newEntry['submissionFileId'] = null; + $newEntry['fileType'] = null; + $newEntry['chapterId'] = null; + $newEntry['seriesId'] = null; + switch ($assocType) { case Application::getContextAssocType(): // $newEntry['contextId'] has already been set $newEntry['assocType'] = $assocType; + $newEntry['canonicalUrlOp'] = $newEntry['canonicalUrlPage'] == 'catalog' ? 
'index' : ''; break; case Application::ASSOC_TYPE_SUBMISSION: @@ -787,6 +810,7 @@ protected function setOMPAssoc(int $assocType, array $args, array &$newEntry): v break; } $submissionId = $submission->getId(); + $newEntry['canonicalUrlArgs'] = [$submissionId]; // If it is an older submission version, the arguments must be: // $submissionId/version/$publicationId. @@ -801,6 +825,7 @@ protected function setOMPAssoc(int $assocType, array $args, array &$newEntry): v fwrite(STDERR, "Publication (submission version) with the ID {$publicationId} does not exist in the submission with the ID {$submissionId}." . PHP_EOL); break; } + array_push($newEntry['canonicalUrlArgs'], 'version', $publicationId); } // Is it a chapter landing page @@ -827,11 +852,14 @@ protected function setOMPAssoc(int $assocType, array $args, array &$newEntry): v fwrite(STDERR, "Chapter with the ID {$chapterId} does not exist." . PHP_EOL); break; } + array_push($newEntry['canonicalUrlArgs'], 'chapter', $chapterId); } $newEntry['submissionId'] = $submissionId; $newEntry['assocType'] = isset($chapter) ? Application::ASSOC_TYPE_CHAPTER : $assocType; - $newEntry['chpaterId'] = isset($chapter) ? $chapter->getId() : null; + $newEntry['chapterId'] = isset($chapter) ? $chapter->getId() : null; + $newEntry['canonicalUrlPage'] = 'catalog'; + $newEntry['canonicalUrlOp'] = 'book'; break; case Application::ASSOC_TYPE_SUBMISSION_FILE: @@ -854,6 +882,7 @@ protected function setOMPAssoc(int $assocType, array $args, array &$newEntry): v break; } $submissionId = $submission->getId(); + $newEntry['canonicalUrlArgs'] = [$submissionId]; // If it is an older submission version, the arguments must be: // $submissionId/version/$publicationId/$representationId/$submissionFileId. 
@@ -866,6 +895,7 @@ protected function setOMPAssoc(int $assocType, array $args, array &$newEntry): v $publicationId = (int) $args[2]; $representationUrlPath = $args[3]; $submissionFileId = (int) $args[4]; + array_push($newEntry['canonicalUrlArgs'], 'version', $publicationId); } else { $representationUrlPath = $args[1]; $submissionFileId = (int) $args[2]; @@ -940,7 +970,7 @@ protected function setOMPAssoc(int $assocType, array $args, array &$newEntry): v } // is this a full text or supp file - $genreDao = DAORegistry::getDAO('GenreDAO'); + $genreDao = DAORegistry::getDAO('GenreDAO'); /** @var GenreDAO $genreDao */ $genre = $genreDao->getById($submissionFile->getData('genreId')); if ($genre->getCategory() != Genre::GENRE_CATEGORY_DOCUMENT || $genre->getSupplementary() || $genre->getDependent()) { $newEntry['assocType'] = Application::ASSOC_TYPE_SUBMISSION_FILE_COUNTER_OTHER; @@ -952,6 +982,10 @@ protected function setOMPAssoc(int $assocType, array $args, array &$newEntry): v $newEntry['submissionFileId'] = $submissionFileId; $newEntry['fileType'] = StatisticsHelper::getDocumentType($submissionFile->getData('mimetype')); $newEntry['chapterId'] = $submissionFile->getData('chapterId'); + + $newEntry['canonicalUrlPage'] = 'catalog'; + $newEntry['canonicalUrlOp'] = 'download'; + array_push($newEntry['canonicalUrlArgs'], $representationId, $submissionFileId); break; case Application::ASSOC_TYPE_SERIES: @@ -967,6 +1001,10 @@ protected function setOMPAssoc(int $assocType, array $args, array &$newEntry): v } $newEntry['seriesId'] = $series->getId(); $newEntry['assocType'] = $assocType; + + $newEntry['canonicalUrlPage'] = 'catalog'; + $newEntry['canonicalUrlOp'] = 'series'; + $newEntry['canonicalUrlArgs'] = [$seriesPath]; break; } } @@ -977,10 +1015,17 @@ protected function setOMPAssoc(int $assocType, array $args, array &$newEntry): v */ protected function setOPSAssoc(int $assocType, array $args, array &$newEntry): void { + $newEntry['submissionId'] = null; + 
$newEntry['representationId'] = null; + $newEntry['submissionFileId'] = null; + $newEntry['fileType'] = null; + switch ($assocType) { case Application::getContextAssocType(): // $newEntry['contextId'] has already been set $newEntry['assocType'] = $assocType; + $newEntry['canonicalUrlPage'] = Application::SITE_CONTEXT_PATH; + $newEntry['canonicalUrlOp'] = ''; break; case Application::ASSOC_TYPE_SUBMISSION: @@ -995,6 +1040,7 @@ protected function setOPSAssoc(int $assocType, array $args, array &$newEntry): v break; } $submissionId = $submission->getId(); + $newEntry['canonicalUrlArgs'] = [$submissionId]; // If it is an older submission version, the arguments must be: // $submissionId/version/$publicationId. @@ -1008,9 +1054,12 @@ protected function setOPSAssoc(int $assocType, array $args, array &$newEntry): v fwrite(STDERR, "Publication (submission version) with the ID {$publicationId} does not exist in the submission with the ID {$submissionId}." . PHP_EOL); break; } + array_push($newEntry['canonicalUrlArgs'], 'version', $publicationId); } $newEntry['submissionId'] = $submissionId; $newEntry['assocType'] = $assocType; + $newEntry['canonicalUrlPage'] = 'preprint'; + $newEntry['canonicalUrlOp'] = 'view'; break; case Application::ASSOC_TYPE_SUBMISSION_FILE: @@ -1033,6 +1082,7 @@ protected function setOPSAssoc(int $assocType, array $args, array &$newEntry): v break; } $submissionId = $submission->getId(); + $newEntry['canonicalUrlArgs'] = [$submissionId]; // If it is an older submission version, the arguments must be: // $submissionId/version/$publicationId/$representationId/$submissionFileId. 
@@ -1045,6 +1095,7 @@ protected function setOPSAssoc(int $assocType, array $args, array &$newEntry): v $publicationId = (int) $args[2]; $representationUrlPath = $args[3]; $submissionFileId = (int) $args[4]; + array_push($newEntry['canonicalUrlArgs'], 'version', $publicationId); } else { $representationUrlPath = $args[1]; $submissionFileId = (int) $args[2]; @@ -1115,7 +1166,7 @@ protected function setOPSAssoc(int $assocType, array $args, array &$newEntry): v } // is this a full text or supp file - $genreDao = DAORegistry::getDAO('GenreDAO'); + $genreDao = DAORegistry::getDAO('GenreDAO'); /** @var GenreDAO $genreDao */ $genre = $genreDao->getById($submissionFile->getData('genreId')); if ($genre->getCategory() != Genre::GENRE_CATEGORY_DOCUMENT || $genre->getSupplementary() || $genre->getDependent()) { $newEntry['assocType'] = Application::ASSOC_TYPE_SUBMISSION_FILE_COUNTER_OTHER; @@ -1126,6 +1177,10 @@ protected function setOPSAssoc(int $assocType, array $args, array &$newEntry): v $newEntry['representationId'] = $representationId; $newEntry['submissionFileId'] = $submissionFileId; $newEntry['fileType'] = StatisticsHelper::getDocumentType($submissionFile->getData('mimetype')); + + $newEntry['canonicalUrlPage'] = 'preprint'; + $newEntry['canonicalUrlOp'] = 'download'; + array_push($newEntry['canonicalUrlArgs'], $representationId, $submissionFileId); break; } }