Commit

Merge main into sweep/add-sweep-config
sweep-ai[bot] authored Dec 6, 2023
2 parents a0e6c92 + 491efa1 commit 01d6409
Showing 6 changed files with 69 additions and 132 deletions.
26 changes: 9 additions & 17 deletions scrapy_crawler/Bungae/TotalSearch/pipelines.py
@@ -2,7 +2,8 @@
from scrapy.exceptions import DropItem

from scrapy_crawler.Bungae.TotalSearch.spiders.BgKeywordSpider import BgKeywordSpider
from scrapy_crawler.common.db import RawUsedItem
from scrapy_crawler.common.db.models import LogCrawler
from scrapy_crawler.common.enums import SourceEnum
from scrapy_crawler.common.utils.constants import BunJang
from scrapy_crawler.common.utils.custom_exceptions import (
DropDuplicateItem,
@@ -11,9 +12,9 @@
DropTooLowPriceItem,
)
from scrapy_crawler.common.utils.helpers import (
get_local_timestring,
has_forbidden_keyword,
publish_sqs_message,
save_image_from_url,
too_long_text,
too_low_price,
)
@@ -27,11 +28,8 @@ def __init__(self):

def has_duplicate(self, item):
return (
self.session.query(RawUsedItem)
.filter(
RawUsedItem.url == BunJang.ARTICLE_URL % str(item["pid"])
or RawUsedItem.title == item["title"]
)
self.session.query(LogCrawler)
.filter(LogCrawler.url == BunJang.ARTICLE_URL % str(item["pid"]))
.first()
is not None
)
@@ -98,17 +96,11 @@ def process_item(self, item, spider: BgKeywordSpider):
spider.logger.info(f"[{type(self).__name__}] start process_item {item['pid']}")
try:
self.session.add(
RawUsedItem(
writer=item["writer"],
title=item["title"],
content=item["content"],
price=item["price"],
date=item["date"],
source=item["source"],
LogCrawler(
url=BunJang.ARTICLE_URL % str(item["pid"]),
img_url=item["img_url"],
image=save_image_from_url(item["img_url"]).getvalue(),
raw_json=item["raw_json"],
source=SourceEnum.BUNGAE.value,
created_at=get_local_timestring(),
item_status="CRAWLED",
)
)

64 changes: 20 additions & 44 deletions scrapy_crawler/Bungae/TotalSearch/spiders/BgKeywordSpider.py
@@ -2,25 +2,20 @@
from typing import List
from urllib import parse

import boto3
import scrapy
from scrapy import signals
from scrapy.utils.project import get_project_settings
from sqlalchemy.orm import sessionmaker
from twisted.python.failure import Failure

from scrapy_crawler.Bungae.metadata.article import ArticleRoot
from scrapy_crawler.Bungae.metadata.total_search import BgList, TotalSearchRoot
from scrapy_crawler.Bungae.TotalSearch.items import ArticleItem
from scrapy_crawler.common.db import get_engine
from scrapy_crawler.common.db.models import DroppedItem, RawUsedItem
from scrapy_crawler.common.db.models import LogCrawler
from scrapy_crawler.common.enums import SourceEnum
from scrapy_crawler.common.slack.SlackBots import ExceptionSlackBot
from scrapy_crawler.common.utils.constants import BunJang
from scrapy_crawler.common.utils.helpers import (
exception_to_category_code,
get_local_timestring,
)
from scrapy_crawler.common.utils.helpers import get_local_timestring


class BgKeywordSpider(scrapy.Spider):
@@ -29,24 +24,24 @@ class BgKeywordSpider(scrapy.Spider):
"ITEM_PIPELINES": {
"scrapy_crawler.Bungae.TotalSearch.pipelines.DuplicateFilterPipeline": 1,
"scrapy_crawler.Bungae.TotalSearch.pipelines.ManualFilterPipeline": 2,
"scrapy_crawler.Bungae.TotalSearch.pipelines.PublishSQSPipeline": 3,
# "scrapy_crawler.Bungae.TotalSearch.pipelines.PublishSQSPipeline": 3,
"scrapy_crawler.Bungae.TotalSearch.pipelines.PostgresExportPipeline": 4,
}
}

def __init__(self, keyword=None, *args, **kwargs):
super().__init__(*args, **kwargs)
settings = get_project_settings()
sqs = boto3.resource(
"sqs",
aws_access_key_id=settings["AWS_ACCESS_KEY_ID"],
aws_secret_access_key=settings["AWS_SECRET_ACCESS_KEY"],
region_name=settings["AWS_REGION_NAME"],
)

self.live_queue = sqs.get_queue_by_name(
QueueName=settings["AWS_LIVE_QUEUE_NAME"]
)
# settings = get_project_settings()
# sqs = boto3.resource(
# "sqs",
# aws_access_key_id=settings["AWS_ACCESS_KEY_ID"],
# aws_secret_access_key=settings["AWS_SECRET_ACCESS_KEY"],
# region_name=settings["AWS_REGION_NAME"],
# )
#
# self.live_queue = sqs.get_queue_by_name(
# QueueName=settings["AWS_LIVE_QUEUE_NAME"]
# )
self.session = sessionmaker(bind=get_engine())()
self.exception_slack_bot: ExceptionSlackBot = ExceptionSlackBot()
self.keyword = keyword
@@ -67,39 +62,22 @@ def item_error(self, item, response, spider, failure: Failure):
spider.name, failure.getErrorMessage()
)

def item_already_dropped(self, url) -> bool:
return (
self.session.query(DroppedItem)
.filter(DroppedItem.source == SourceEnum.BUNGAE.value)
.filter(DroppedItem.url == url)
.first()
is not None
)

def item_already_crawled(self, url) -> bool:
return (
self.session.query(RawUsedItem)
.filter(RawUsedItem.url == url)
.filter(RawUsedItem.source == SourceEnum.BUNGAE.value)
.first()
self.session.query(LogCrawler).filter(LogCrawler.url == url).first()
is not None
)

def item_dropped(self, item, response, exception, spider):
if self.item_already_dropped(BunJang.ARTICLE_URL % str(item["pid"])) or (
(category_code := exception_to_category_code(exception)) is None
):
return

self.logger.info(f"Item dropped: {exception.__class__.__name__}")

try:
self.session.add(
DroppedItem(
source=SourceEnum.BUNGAE.value,
category=category_code,
LogCrawler(
url=BunJang.ARTICLE_URL % str(item["pid"]),
dropped_at=get_local_timestring(),
crawled_at=get_local_timestring(),
source=SourceEnum.BUNGAE.value,
item_status=f"DROPPED_{exception.__class__.__name__}",
)
)

Expand All @@ -120,9 +98,7 @@ def parse(self, response):

for article in articles:
article_url = BunJang.ARTICLE_URL % article.pid
if self.item_already_crawled(article_url) or self.item_already_dropped(
article_url
):
if self.item_already_crawled(article_url):
continue

yield scrapy.Request(
35 changes: 9 additions & 26 deletions scrapy_crawler/Joonggonara/TotalSearch/pipelines.py
@@ -4,12 +4,9 @@
from itemadapter import ItemAdapter
from scrapy import Selector

from scrapy_crawler.common.db.models import RawUsedItem
from scrapy_crawler.common.utils import (
has_forbidden_keyword,
save_image_from_url,
too_low_price,
)
from scrapy_crawler.common.db.models import LogCrawler
from scrapy_crawler.common.enums import SourceEnum
from scrapy_crawler.common.utils import has_forbidden_keyword, too_low_price
from scrapy_crawler.common.utils.custom_exceptions import (
DropDuplicateItem,
DropForbiddenKeywordItem,
@@ -65,9 +62,6 @@ def process_item(self, item, spider):

item["content"] = content

# images = selector.css(".se-image-resource::attr(src)").getall()
# item["content"] = content + "\n[Image URLS]\n" + "\n".join(images)

return item


@@ -80,11 +74,8 @@ def __init__(self):
def is_duplicated(self, adapter: ItemAdapter) -> bool:
try:
item = (
self.session.query(RawUsedItem)
.filter(
RawUsedItem.url == adapter["url"]
or RawUsedItem.title == adapter["title"]
)
self.session.query(LogCrawler)
.filter(LogCrawler.url == adapter["url"])
.first()
)
return item is not None
@@ -172,20 +163,12 @@ def process_item(self, item, spider: JgKeywordSpider):
adapter = ItemAdapter(item)

try:
image = save_image_from_url(adapter["img_url"] + "?type=w300")

self.session.add(
RawUsedItem(
writer=adapter["writer"],
title=adapter["title"],
content=adapter["content"],
price=adapter["price"],
date=adapter["date"],
LogCrawler(
item_status="CRAWLED",
source=SourceEnum.JOONGGONARA.value,
url=adapter["url"],
img_url=adapter["img_url"],
source=adapter["source"],
image=image.getvalue(),
raw_json=adapter["raw_json"],
created_at=adapter["date"],
)
)
self.session.commit()
63 changes: 20 additions & 43 deletions scrapy_crawler/Joonggonara/TotalSearch/spiders/JgKeywordSpider.py
@@ -2,23 +2,19 @@
import json
from urllib import parse

import boto3
# import boto3
import scrapy
from scrapy import signals
from scrapy.utils.project import get_project_settings
from sqlalchemy.orm import sessionmaker
from twisted.python.failure import Failure

from scrapy_crawler.common.db import get_engine
from scrapy_crawler.common.db.models import DroppedItem, RawUsedItem
from scrapy_crawler.common.db.models import LogCrawler
from scrapy_crawler.common.enums import SourceEnum
from scrapy_crawler.common.slack.SlackBots import ExceptionSlackBot
from scrapy_crawler.common.utils import to_local_timestring
from scrapy_crawler.common.utils.constants import Joonggonara
from scrapy_crawler.common.utils.helpers import (
exception_to_category_code,
get_local_timestring,
)
from scrapy_crawler.common.utils.helpers import get_local_timestring
from scrapy_crawler.Joonggonara.metadata.article import ArticleRoot
from scrapy_crawler.Joonggonara.metadata.total_search import TotalSearchRoot
from scrapy_crawler.Joonggonara.TotalSearch.items import ArticleItem
@@ -32,24 +28,24 @@ class JgKeywordSpider(scrapy.Spider):
"scrapy_crawler.Joonggonara.TotalSearch.pipelines.HtmlParserPipeline": 1,
"scrapy_crawler.Joonggonara.TotalSearch.pipelines.ManualFilterPipeline": 2,
"scrapy_crawler.Joonggonara.TotalSearch.pipelines.DuplicateFilterPipeline": 3,
"scrapy_crawler.Joonggonara.TotalSearch.pipelines.PublishSQSPipeline": 4,
# "scrapy_crawler.Joonggonara.TotalSearch.pipelines.PublishSQSPipeline": 4,
"scrapy_crawler.Joonggonara.TotalSearch.pipelines.PostgresExportPipeline": 5,
}
}

def __init__(self, keyword=None, *args, **kwargs):
super().__init__(*args, **kwargs)
settings = get_project_settings()
sqs = boto3.resource(
"sqs",
aws_access_key_id=settings["AWS_ACCESS_KEY_ID"],
aws_secret_access_key=settings["AWS_SECRET_ACCESS_KEY"],
region_name=settings["AWS_REGION_NAME"],
)

self.live_queue = sqs.get_queue_by_name(
QueueName=settings["AWS_LIVE_QUEUE_NAME"]
)
# settings = get_project_settings()
# sqs = boto3.resource(
# "sqs",
# aws_access_key_id=settings["AWS_ACCESS_KEY_ID"],
# aws_secret_access_key=settings["AWS_SECRET_ACCESS_KEY"],
# region_name=settings["AWS_REGION_NAME"],
# )
#
# self.live_queue = sqs.get_queue_by_name(
# QueueName=settings["AWS_LIVE_QUEUE_NAME"]
# )
self.session = sessionmaker(bind=get_engine())()
self.exception_slack_bot: ExceptionSlackBot = ExceptionSlackBot()
self.keyword = keyword
@@ -70,39 +66,22 @@ def item_error(self, item, response, spider, failure: Failure):
spider.name, failure.getErrorMessage()
)

def item_already_dropped(self, url) -> bool:
return (
self.session.query(DroppedItem)
.filter(DroppedItem.source == SourceEnum.JOONGGONARA.value)
.filter(DroppedItem.url == url)
.first()
is not None
)

def item_already_crawled(self, url) -> bool:
return (
self.session.query(RawUsedItem)
.filter(RawUsedItem.url == url)
.filter(RawUsedItem.source == SourceEnum.JOONGGONARA.value)
.first()
self.session.query(LogCrawler).filter(LogCrawler.url == url).first()
is not None
)

def item_dropped(self, item, response, exception, spider):
if self.item_already_dropped(item["url"]) or (
(category_code := exception_to_category_code(exception)) is None
):
return

self.logger.info(f"Item dropped: {exception.__class__.__name__}")

try:
self.session.add(
DroppedItem(
LogCrawler(
source=SourceEnum.JOONGGONARA.value,
category=category_code,
item_status=f"DROPPED_{exception.__class__.__name__}",
url=item["url"],
dropped_at=get_local_timestring(),
created_at=get_local_timestring(),
)
)

@@ -134,9 +113,7 @@ def parse(self, response):
for article in target_articles:
article_url = Joonggonara.ARTICLE_URL % article.articleId

if self.item_already_crawled(article_url) or self.item_already_dropped(
article_url
):
if self.item_already_crawled(article_url):
continue

yield scrapy.Request(
4 changes: 2 additions & 2 deletions scrapy_crawler/common/chatgpt/chains.py
@@ -22,7 +22,7 @@ def create_llm_chain(model_name: str, template_path: str, input_variables: list[
GPT4_MODEL_NAME = "gpt-4-0613"
GPT3_MODEL_NAME = "gpt-3.5-turbo-0613"
unused_chain: LLMChain = create_llm_chain(
GPT4_MODEL_NAME, PREFIX % "unused.txt", ["title", "content"]
GPT3_MODEL_NAME, PREFIX % "unused.txt", ["title", "content"]
)
apple_care_plus_chain: LLMChain = create_llm_chain(
GPT3_MODEL_NAME, PREFIX % "apple_care_plus.txt", ["title", "content"]
@@ -64,7 +64,7 @@ def create_llm_chain(model_name: str, template_path: str, input_variables: list[
GPT3_MODEL_NAME, PREFIX % "ipad_system.txt", ["title", "content", "default_ssd"]
)
ipad_cellular_chain: LLMChain = create_llm_chain(
GPT4_MODEL_NAME, PREFIX % "ipad_cellular.txt", ["title", "content"]
GPT3_MODEL_NAME, PREFIX % "ipad_cellular.txt", ["title", "content"]
)

iphone_generation_chain = create_llm_chain(
9 changes: 9 additions & 0 deletions scrapy_crawler/common/db/models.py
@@ -143,3 +143,12 @@ class DroppedItem(Base):
source = Column("source", String)
url = Column("url", String)
dropped_at = Column("dropped_at", String)


class LogCrawler(Base):
__tablename__ = "log_crawler"
id = Column(Integer, primary_key=True)
item_status = Column("item_status", String)
source = Column("source", String)
url = Column("url", String)
created_at = Column("created_at", String)
