From 836b46e9f03ab8125dc9d935951aaa97ef123074 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=ED=99=A9=EA=B7=9C=EB=8F=84?= Date: Sat, 9 Sep 2023 14:12:22 +0900 Subject: [PATCH 1/6] chore: Add aws-default-region --- scrapy_crawler/settings.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scrapy_crawler/settings.py b/scrapy_crawler/settings.py index 718841c..95eeabe 100644 --- a/scrapy_crawler/settings.py +++ b/scrapy_crawler/settings.py @@ -23,6 +23,7 @@ AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID") AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY") AWS_REGION_NAME = os.environ.get("AWS_REGION_NAME") +AWS_DEFAULT_REGION = os.environ.get("AWS_REGION_NAME") AWS_LIVE_QUEUE_NAME = os.environ.get("AWS_LIVE_QUEUE_NAME") # Crawl responsibly by identifying yourself (and your website) on the user-agent From 98e7e391fdc528b216728d2c4c9422f3a80f24d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=ED=99=A9=EA=B7=9C=EB=8F=84?= Date: Fri, 15 Sep 2023 21:03:05 +0900 Subject: [PATCH 2/6] hotfix: Update All items --- .../DBWatchDog/DailyScheduler/spiders/PriceUpdateSpider.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scrapy_crawler/DBWatchDog/DailyScheduler/spiders/PriceUpdateSpider.py b/scrapy_crawler/DBWatchDog/DailyScheduler/spiders/PriceUpdateSpider.py index e3710a2..cbf8931 100644 --- a/scrapy_crawler/DBWatchDog/DailyScheduler/spiders/PriceUpdateSpider.py +++ b/scrapy_crawler/DBWatchDog/DailyScheduler/spiders/PriceUpdateSpider.py @@ -44,7 +44,6 @@ def get_soldout_items(self) -> List[Deal]: self.session.query(Deal) .filter(Deal.sold == true()) .filter(Deal.deleted_at == null()) - .filter(Deal.condition == "S") ) return item.all() From b8ba5f58d324098ce117610d305e569185819bcf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=ED=99=A9=EA=B7=9C=EB=8F=84?= Date: Fri, 13 Oct 2023 20:19:23 +0900 Subject: [PATCH 3/6] hotfix: downgrade gpt version --- scrapy_crawler/common/chatgpt/chains.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scrapy_crawler/common/chatgpt/chains.py b/scrapy_crawler/common/chatgpt/chains.py index d400ed0..8204fc3 100644 --- a/scrapy_crawler/common/chatgpt/chains.py +++ b/scrapy_crawler/common/chatgpt/chains.py @@ -22,7 +22,7 @@ def create_llm_chain(model_name: str, template_path: str, input_variables: list[ GPT4_MODEL_NAME = "gpt-4-0613" GPT3_MODEL_NAME = "gpt-3.5-turbo-0613" unused_chain: LLMChain = create_llm_chain( - GPT4_MODEL_NAME, PREFIX % "unused.txt", ["title", "content"] + GPT3_MODEL_NAME, PREFIX % "unused.txt", ["title", "content"] ) apple_care_plus_chain: LLMChain = create_llm_chain( GPT3_MODEL_NAME, PREFIX % "apple_care_plus.txt", ["title", "content"] @@ -64,7 +64,7 @@ def create_llm_chain(model_name: str, template_path: str, input_variables: list[ GPT3_MODEL_NAME, PREFIX % "ipad_system.txt", ["title", "content", "default_ssd"] ) ipad_cellular_chain: LLMChain = create_llm_chain( - GPT4_MODEL_NAME, PREFIX % "ipad_cellular.txt", ["title", "content"] + GPT3_MODEL_NAME, PREFIX % "ipad_cellular.txt", ["title", "content"] ) iphone_generation_chain = create_llm_chain( From 610d6834ad6b7413a48a1537b85437427c9d930d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=ED=99=A9=EA=B7=9C=EB=8F=84?= Date: Sun, 12 Nov 2023 14:51:16 +0900 Subject: [PATCH 4/6] refactoring: Publish to SQS --- .../Bungae/TotalSearch/pipelines.py | 26 +++++-------- .../TotalSearch/spiders/BgKeywordSpider.py | 38 ++++--------------- .../Joonggonara/TotalSearch/pipelines.py | 35 +++++------------ .../TotalSearch/spiders/JgKeywordSpider.py | 36 ++++-------------- 
scrapy_crawler/common/db/models.py | 9 +++++ 5 files changed, 42 insertions(+), 102 deletions(-) diff --git a/scrapy_crawler/Bungae/TotalSearch/pipelines.py b/scrapy_crawler/Bungae/TotalSearch/pipelines.py index 1cf1749..9f96c14 100644 --- a/scrapy_crawler/Bungae/TotalSearch/pipelines.py +++ b/scrapy_crawler/Bungae/TotalSearch/pipelines.py @@ -2,7 +2,8 @@ from scrapy.exceptions import DropItem from scrapy_crawler.Bungae.TotalSearch.spiders.BgKeywordSpider import BgKeywordSpider -from scrapy_crawler.common.db import RawUsedItem +from scrapy_crawler.common.db.models import LogCrawler +from scrapy_crawler.common.enums import SourceEnum from scrapy_crawler.common.utils.constants import BunJang from scrapy_crawler.common.utils.custom_exceptions import ( DropDuplicateItem, @@ -11,9 +12,9 @@ DropTooLowPriceItem, ) from scrapy_crawler.common.utils.helpers import ( + get_local_timestring, has_forbidden_keyword, publish_sqs_message, - save_image_from_url, too_long_text, too_low_price, ) @@ -27,11 +28,8 @@ def __init__(self): def has_duplicate(self, item): return ( - self.session.query(RawUsedItem) - .filter( - RawUsedItem.url == BunJang.ARTICLE_URL % str(item["pid"]) - or RawUsedItem.title == item["title"] - ) + self.session.query(LogCrawler) + .filter(LogCrawler.url == BunJang.ARTICLE_URL % str(item["pid"])) .first() is not None ) @@ -98,17 +96,11 @@ def process_item(self, item, spider: BgKeywordSpider): spider.logger.info(f"[{type(self).__name__}] start process_item {item['pid']}") try: self.session.add( - RawUsedItem( - writer=item["writer"], - title=item["title"], - content=item["content"], - price=item["price"], - date=item["date"], - source=item["source"], + LogCrawler( url=BunJang.ARTICLE_URL % str(item["pid"]), - img_url=item["img_url"], - image=save_image_from_url(item["img_url"]).getvalue(), - raw_json=item["raw_json"], + source=SourceEnum.BUNGAE.value, + created_at=get_local_timestring(), + item_status="CRAWLED", ) ) diff --git a/scrapy_crawler/Bungae/TotalSearch/spiders/BgKeywordSpider.py b/scrapy_crawler/Bungae/TotalSearch/spiders/BgKeywordSpider.py index 3bd5761..4510985 100644 --- a/scrapy_crawler/Bungae/TotalSearch/spiders/BgKeywordSpider.py +++ b/scrapy_crawler/Bungae/TotalSearch/spiders/BgKeywordSpider.py @@ -13,14 +13,11 @@ from scrapy_crawler.Bungae.metadata.total_search import BgList, TotalSearchRoot from scrapy_crawler.Bungae.TotalSearch.items import ArticleItem from scrapy_crawler.common.db import get_engine -from scrapy_crawler.common.db.models import DroppedItem, RawUsedItem +from scrapy_crawler.common.db.models import LogCrawler from scrapy_crawler.common.enums import SourceEnum from scrapy_crawler.common.slack.SlackBots import ExceptionSlackBot from scrapy_crawler.common.utils.constants import BunJang -from scrapy_crawler.common.utils.helpers import ( - exception_to_category_code, - get_local_timestring, -) +from scrapy_crawler.common.utils.helpers import get_local_timestring class BgKeywordSpider(scrapy.Spider): @@ -67,39 +64,22 @@ def item_error(self, item, response, spider, failure: Failure): spider.name, failure.getErrorMessage() ) - def item_already_dropped(self, url) -> bool: - return ( - self.session.query(DroppedItem) - .filter(DroppedItem.source == SourceEnum.BUNGAE.value) - .filter(DroppedItem.url == url) - .first() - is not None - ) - def item_already_crawled(self, url) -> bool: return ( - self.session.query(RawUsedItem) - .filter(RawUsedItem.url == url) - .filter(RawUsedItem.source == SourceEnum.BUNGAE.value) - .first() + 
self.session.query(LogCrawler).filter(LogCrawler.url == url).first() is not None ) def item_dropped(self, item, response, exception, spider): - if self.item_already_dropped(BunJang.ARTICLE_URL % str(item["pid"])) or ( - (category_code := exception_to_category_code(exception)) is None - ): - return - self.logger.info(f"Item dropped: {exception.__class__.__name__}") try: self.session.add( - DroppedItem( - source=SourceEnum.BUNGAE.value, - category=category_code, + LogCrawler( url=BunJang.ARTICLE_URL % str(item["pid"]), - dropped_at=get_local_timestring(), + crawled_at=get_local_timestring(), + source=SourceEnum.BUNGAE.value, + item_status=f"DROPPED_{exception.__class__.__name__}", ) ) @@ -120,9 +100,7 @@ def parse(self, response): for article in articles: article_url = BunJang.ARTICLE_URL % article.pid - if self.item_already_crawled(article_url) or self.item_already_dropped( - article_url - ): + if self.item_already_crawled(article_url): continue yield scrapy.Request( diff --git a/scrapy_crawler/Joonggonara/TotalSearch/pipelines.py b/scrapy_crawler/Joonggonara/TotalSearch/pipelines.py index c13cf25..4404b1f 100644 --- a/scrapy_crawler/Joonggonara/TotalSearch/pipelines.py +++ b/scrapy_crawler/Joonggonara/TotalSearch/pipelines.py @@ -4,12 +4,9 @@ from itemadapter import ItemAdapter from scrapy import Selector -from scrapy_crawler.common.db.models import RawUsedItem -from scrapy_crawler.common.utils import ( - has_forbidden_keyword, - save_image_from_url, - too_low_price, -) +from scrapy_crawler.common.db.models import LogCrawler +from scrapy_crawler.common.enums import SourceEnum +from scrapy_crawler.common.utils import has_forbidden_keyword, too_low_price from scrapy_crawler.common.utils.custom_exceptions import ( DropDuplicateItem, DropForbiddenKeywordItem, @@ -65,9 +62,6 @@ def process_item(self, item, spider): item["content"] = content - # images = selector.css(".se-image-resource::attr(src)").getall() - # item["content"] = content + "\n[Image URLS]\n" + "\n".join(images) - return item @@ -80,11 +74,8 @@ def __init__(self): def is_duplicated(self, adapter: ItemAdapter) -> bool: try: item = ( - self.session.query(RawUsedItem) - .filter( - RawUsedItem.url == adapter["url"] - or RawUsedItem.title == adapter["title"] - ) + self.session.query(LogCrawler) + .filter(LogCrawler.url == adapter["url"]) .first() ) return item is not None @@ -172,20 +163,12 @@ def process_item(self, item, spider: JgKeywordSpider): adapter = ItemAdapter(item) try: - image = save_image_from_url(adapter["img_url"] + "?type=w300") - self.session.add( - RawUsedItem( - writer=adapter["writer"], - title=adapter["title"], - content=adapter["content"], - price=adapter["price"], - date=adapter["date"], + LogCrawler( + item_status="CRAWLED", + source=SourceEnum.JOONGGONARA.value, url=adapter["url"], - img_url=adapter["img_url"], - source=adapter["source"], - image=image.getvalue(), - raw_json=adapter["raw_json"], + created_at=adapter["date"], ) ) self.session.commit() diff --git a/scrapy_crawler/Joonggonara/TotalSearch/spiders/JgKeywordSpider.py b/scrapy_crawler/Joonggonara/TotalSearch/spiders/JgKeywordSpider.py index 4030026..f3e612b 100644 --- a/scrapy_crawler/Joonggonara/TotalSearch/spiders/JgKeywordSpider.py +++ b/scrapy_crawler/Joonggonara/TotalSearch/spiders/JgKeywordSpider.py @@ -10,15 +10,12 @@ from twisted.python.failure import Failure from scrapy_crawler.common.db import get_engine -from scrapy_crawler.common.db.models import DroppedItem, RawUsedItem +from scrapy_crawler.common.db.models import LogCrawler from 
scrapy_crawler.common.enums import SourceEnum from scrapy_crawler.common.slack.SlackBots import ExceptionSlackBot from scrapy_crawler.common.utils import to_local_timestring from scrapy_crawler.common.utils.constants import Joonggonara -from scrapy_crawler.common.utils.helpers import ( - exception_to_category_code, - get_local_timestring, -) +from scrapy_crawler.common.utils.helpers import get_local_timestring from scrapy_crawler.Joonggonara.metadata.article import ArticleRoot from scrapy_crawler.Joonggonara.metadata.total_search import TotalSearchRoot from scrapy_crawler.Joonggonara.TotalSearch.items import ArticleItem @@ -70,39 +67,22 @@ def item_error(self, item, response, spider, failure: Failure): spider.name, failure.getErrorMessage() ) - def item_already_dropped(self, url) -> bool: - return ( - self.session.query(DroppedItem) - .filter(DroppedItem.source == SourceEnum.JOONGGONARA.value) - .filter(DroppedItem.url == url) - .first() - is not None - ) - def item_already_crawled(self, url) -> bool: return ( - self.session.query(RawUsedItem) - .filter(RawUsedItem.url == url) - .filter(RawUsedItem.source == SourceEnum.JOONGGONARA.value) - .first() + self.session.query(LogCrawler).filter(LogCrawler.url == url).first() is not None ) def item_dropped(self, item, response, exception, spider): - if self.item_already_dropped(item["url"]) or ( - (category_code := exception_to_category_code(exception)) is None - ): - return - self.logger.info(f"Item dropped: {exception.__class__.__name__}") try: self.session.add( - DroppedItem( + LogCrawler( source=SourceEnum.JOONGGONARA.value, - category=category_code, + item_status=f"DROPPED_{exception.__class__.__name__}", url=item["url"], - dropped_at=get_local_timestring(), + created_at=get_local_timestring(), ) ) @@ -134,9 +114,7 @@ def parse(self, response): for article in target_articles: article_url = Joonggonara.ARTICLE_URL % article.articleId - if self.item_already_crawled(article_url) or self.item_already_dropped( - article_url - ): + if self.item_already_crawled(article_url): continue yield scrapy.Request( diff --git a/scrapy_crawler/common/db/models.py b/scrapy_crawler/common/db/models.py index 79906f1..a78ab53 100644 --- a/scrapy_crawler/common/db/models.py +++ b/scrapy_crawler/common/db/models.py @@ -143,3 +143,12 @@ class DroppedItem(Base): source = Column("source", String) url = Column("url", String) dropped_at = Column("dropped_at", String) + + +class LogCrawler(Base): + __tablename__ = "log_crawler" + id = Column(Integer, primary_key=True) + item_status = Column("item_status", String) + source = Column("source", String) + url = Column("url", String) + created_at = Column("created_at", String) From 48e856e19828e9ce4455c49fa757d2b4745a306d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=ED=99=A9=EA=B7=9C=EB=8F=84?= Date: Fri, 17 Nov 2023 22:15:08 +0900 Subject: [PATCH 5/6] feat: Disable SQS publish for while --- scrapy_crawler/Bungae/TotalSearch/spiders/BgKeywordSpider.py | 2 +- .../Joonggonara/TotalSearch/spiders/JgKeywordSpider.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scrapy_crawler/Bungae/TotalSearch/spiders/BgKeywordSpider.py b/scrapy_crawler/Bungae/TotalSearch/spiders/BgKeywordSpider.py index 4510985..622d4c0 100644 --- a/scrapy_crawler/Bungae/TotalSearch/spiders/BgKeywordSpider.py +++ b/scrapy_crawler/Bungae/TotalSearch/spiders/BgKeywordSpider.py @@ -26,7 +26,7 @@ class BgKeywordSpider(scrapy.Spider): "ITEM_PIPELINES": { "scrapy_crawler.Bungae.TotalSearch.pipelines.DuplicateFilterPipeline": 1, 
"scrapy_crawler.Bungae.TotalSearch.pipelines.ManualFilterPipeline": 2, - "scrapy_crawler.Bungae.TotalSearch.pipelines.PublishSQSPipeline": 3, + # "scrapy_crawler.Bungae.TotalSearch.pipelines.PublishSQSPipeline": 3, "scrapy_crawler.Bungae.TotalSearch.pipelines.PostgresExportPipeline": 4, } } diff --git a/scrapy_crawler/Joonggonara/TotalSearch/spiders/JgKeywordSpider.py b/scrapy_crawler/Joonggonara/TotalSearch/spiders/JgKeywordSpider.py index f3e612b..b780ae4 100644 --- a/scrapy_crawler/Joonggonara/TotalSearch/spiders/JgKeywordSpider.py +++ b/scrapy_crawler/Joonggonara/TotalSearch/spiders/JgKeywordSpider.py @@ -29,7 +29,7 @@ class JgKeywordSpider(scrapy.Spider): "scrapy_crawler.Joonggonara.TotalSearch.pipelines.HtmlParserPipeline": 1, "scrapy_crawler.Joonggonara.TotalSearch.pipelines.ManualFilterPipeline": 2, "scrapy_crawler.Joonggonara.TotalSearch.pipelines.DuplicateFilterPipeline": 3, - "scrapy_crawler.Joonggonara.TotalSearch.pipelines.PublishSQSPipeline": 4, + # "scrapy_crawler.Joonggonara.TotalSearch.pipelines.PublishSQSPipeline": 4, "scrapy_crawler.Joonggonara.TotalSearch.pipelines.PostgresExportPipeline": 5, } } From 750fba040b91a1b2a87799611a98c4163cdb31d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=ED=99=A9=EA=B7=9C=EB=8F=84?= Date: Wed, 6 Dec 2023 13:26:17 +0900 Subject: [PATCH 6/6] feat: Disable SQS call for temp --- .../TotalSearch/spiders/BgKeywordSpider.py | 24 ++++++++---------- .../TotalSearch/spiders/JgKeywordSpider.py | 25 +++++++++---------- 2 files changed, 23 insertions(+), 26 deletions(-) diff --git a/scrapy_crawler/Bungae/TotalSearch/spiders/BgKeywordSpider.py b/scrapy_crawler/Bungae/TotalSearch/spiders/BgKeywordSpider.py index 622d4c0..6231f43 100644 --- a/scrapy_crawler/Bungae/TotalSearch/spiders/BgKeywordSpider.py +++ b/scrapy_crawler/Bungae/TotalSearch/spiders/BgKeywordSpider.py @@ -2,10 +2,8 @@ from typing import List from urllib import parse -import boto3 import scrapy from scrapy import signals -from scrapy.utils.project import get_project_settings from sqlalchemy.orm import sessionmaker from twisted.python.failure import Failure @@ -33,17 +31,17 @@ class BgKeywordSpider(scrapy.Spider): def __init__(self, keyword=None, *args, **kwargs): super().__init__(*args, **kwargs) - settings = get_project_settings() - sqs = boto3.resource( - "sqs", - aws_access_key_id=settings["AWS_ACCESS_KEY_ID"], - aws_secret_access_key=settings["AWS_SECRET_ACCESS_KEY"], - region_name=settings["AWS_REGION_NAME"], - ) - - self.live_queue = sqs.get_queue_by_name( - QueueName=settings["AWS_LIVE_QUEUE_NAME"] - ) + # settings = get_project_settings() + # sqs = boto3.resource( + # "sqs", + # aws_access_key_id=settings["AWS_ACCESS_KEY_ID"], + # aws_secret_access_key=settings["AWS_SECRET_ACCESS_KEY"], + # region_name=settings["AWS_REGION_NAME"], + # ) + # + # self.live_queue = sqs.get_queue_by_name( + # QueueName=settings["AWS_LIVE_QUEUE_NAME"] + # ) self.session = sessionmaker(bind=get_engine())() self.exception_slack_bot: ExceptionSlackBot = ExceptionSlackBot() self.keyword = keyword diff --git a/scrapy_crawler/Joonggonara/TotalSearch/spiders/JgKeywordSpider.py b/scrapy_crawler/Joonggonara/TotalSearch/spiders/JgKeywordSpider.py index b780ae4..a6882a8 100644 --- a/scrapy_crawler/Joonggonara/TotalSearch/spiders/JgKeywordSpider.py +++ b/scrapy_crawler/Joonggonara/TotalSearch/spiders/JgKeywordSpider.py @@ -2,10 +2,9 @@ import json from urllib import parse -import boto3 +# import boto3 import scrapy from scrapy import signals -from scrapy.utils.project import get_project_settings from sqlalchemy.orm 
import sessionmaker from twisted.python.failure import Failure @@ -36,17 +35,17 @@ class JgKeywordSpider(scrapy.Spider): def __init__(self, keyword=None, *args, **kwargs): super().__init__(*args, **kwargs) - settings = get_project_settings() - sqs = boto3.resource( - "sqs", - aws_access_key_id=settings["AWS_ACCESS_KEY_ID"], - aws_secret_access_key=settings["AWS_SECRET_ACCESS_KEY"], - region_name=settings["AWS_REGION_NAME"], - ) - - self.live_queue = sqs.get_queue_by_name( - QueueName=settings["AWS_LIVE_QUEUE_NAME"] - ) + # settings = get_project_settings() + # sqs = boto3.resource( + # "sqs", + # aws_access_key_id=settings["AWS_ACCESS_KEY_ID"], + # aws_secret_access_key=settings["AWS_SECRET_ACCESS_KEY"], + # region_name=settings["AWS_REGION_NAME"], + # ) + # + # self.live_queue = sqs.get_queue_by_name( + # QueueName=settings["AWS_LIVE_QUEUE_NAME"] + # ) self.session = sessionmaker(bind=get_engine())() self.exception_slack_bot: ExceptionSlackBot = ExceptionSlackBot() self.keyword = keyword
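
Patches 5/6 and 6/6 take SQS out of the crawl path by commenting out both the PublishSQSPipeline entries and the boto3 queue setup in the two keyword spiders. If the pause is meant to be temporary, one possible alternative (not part of this series) is to gate the queue setup behind a settings flag so publishing can be switched back on without uncommenting code. The sketch below is a minimal illustration under that assumption: SQS_ENABLED is a hypothetical setting name, while the boto3 and Scrapy calls mirror the ones removed above.

    # Hypothetical sketch, not part of this patch series: gate the SQS setup
    # behind an assumed SQS_ENABLED setting instead of commenting it out.
    import boto3
    from scrapy.utils.project import get_project_settings


    def build_live_queue():
        settings = get_project_settings()
        if not settings.getbool("SQS_ENABLED", False):  # assumed flag, off by default
            return None  # callers skip publishing when no queue is configured
        sqs = boto3.resource(
            "sqs",
            aws_access_key_id=settings["AWS_ACCESS_KEY_ID"],
            aws_secret_access_key=settings["AWS_SECRET_ACCESS_KEY"],
            region_name=settings["AWS_REGION_NAME"],
        )
        return sqs.get_queue_by_name(QueueName=settings["AWS_LIVE_QUEUE_NAME"])

With something like this in place, each spider's __init__ could set self.live_queue = build_live_queue() and the publish step could no-op when live_queue is None, keeping a single code path for both the enabled and disabled states.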