diff --git a/adapter/__init__.py b/adapter/__init__.py index c47de45..399f057 100644 --- a/adapter/__init__.py +++ b/adapter/__init__.py @@ -15,6 +15,7 @@ from adapter.fictionPressAdapter import FictionPressAdapter from adapter.hpFanficArchiveAdapter import HpFanficArchiveAdapter from adapter.hpffAdapter import HarryPotterFanfictionAdapter +from adapter.mcstoriesAdapter import McStoriesAdapter from adapter.parahumansAdapter import ParahumansAdapter from adapter.portkeyArchiveAdapter import PortkeyArchiveAdapter from adapter.questionableQuestingAdapter import QuestionableQuestingAdapter @@ -45,6 +46,7 @@ def registerAdapters() -> None: adapters[FicType.sufficientvelocity] = SufficientVelocityAdapter() adapters[FicType.questionablequesting] = QuestionableQuestingAdapter() adapters[FicType.harrypotterfanfiction] = HarryPotterFanfictionAdapter() + adapters[FicType.mcstories] = McStoriesAdapter() adapters[FicType.parahumans] = ParahumansAdapter() adapters[FicType.adultfanfiction] = AdultFanfictionAdapter() adapters[FicType.fanficsme] = FanficsMeAdapter() diff --git a/adapter/mcstoriesAdapter.py b/adapter/mcstoriesAdapter.py new file mode 100644 index 0000000..3243426 --- /dev/null +++ b/adapter/mcstoriesAdapter.py @@ -0,0 +1,182 @@ +import re +import urllib.parse +from typing import Optional + +from bs4 import BeautifulSoup + +import scrape +import util +from adapter.adapter import Adapter, edumpContent +from htypes import FicType, FicId +from schema import OilTimestamp +from store import Fic, Language +from store import FicStatus + +story_index_path_re = re.compile(r"/(?P<lid>\w+)/index\.html") +story_chapter_path_re = re.compile(r"/(?P<lid>\w+)/(?P<clid>\w+)\.html") +relative_chapter_path_re = re.compile(r"(?P<clid>\w+)\.html") + + +def strip_nonnumeric(s): + return ''.join(c for c in s if c.isdigit()) + + +class McStoriesAdapter(Adapter): + def __init__(self) -> None: + super().__init__( + True, + "https://mcstories.com/", + "mcstories.com", + FicType.mcstories, + "mcstories", + ) + + def 
tryParseUrl(self, url: str) -> Optional[FicId]: + path = urllib.parse.urlparse(url).path + + if m := story_index_path_re.fullmatch(path): + lid = m.group("lid") + elif m := story_chapter_path_re.fullmatch(path): + lid = m.group("clid") + else: + return None + return FicId(self.ftype, lid) + + def constructUrl(self, fic, chapter=None): + if chapter is None: + path = f"/{fic.localId}/index.html" + else: + path = f"/{fic.localId}/{chapter.localId}.html" + return urllib.parse.urljoin(self.baseUrl, path) + + def create(self, fic: Fic) -> Fic: + + fic.url = self.constructUrl(fic) + + data = scrape.scrape(fic.url) + edumpContent(data["raw"], "mcstories") + fic = self.parseInfoInto(fic, data["raw"]) + fic.upsert() + + return fic + + def absolute_author_link(self, rel_author_link): + if rel_author_link.startswith('..'): + path = rel_author_link[len('..'):] + else: + path = rel_author_link + return urllib.parse.urljoin(self.baseUrl, path) + + def absolute_chapter_link(self, fic, rel_chapter_link): + path = f"{fic.localId}/{rel_chapter_link}" + return urllib.parse.urljoin(self.baseUrl, path) + + def get_chapter_meta(self, table, fic): + chapters = [] + for chapter_row_dict in util.dicts_from_table(table): + name_cell = chapter_row_dict['Chapter'] + words_cell = chapter_row_dict['Length'] + date_cell = chapter_row_dict['Added'] + + updated_cell = chapter_row_dict.get('Updated', date_cell) + relative_chapter_path = name_cell.find('a')['href'] + clid = relative_chapter_path_re.fullmatch(relative_chapter_path).group('clid') + published = util.parseDateAsUnix(date_cell.string, fic.fetched) + + if updated_cell.string and updated_cell.string.strip(): + updated = util.parseDateAsUnix(updated_cell.string, fic.fetched) + else: + updated = published + chapters.append({ + 'clid': clid, + 'title': name_cell.string, + 'chapter_link': self.absolute_chapter_link(fic, relative_chapter_path), + 'words': int(strip_nonnumeric(words_cell.string)), + 'published': published, + 'updated': updated, + 
}) + return chapters + + def parseInfoInto(self, fic: Fic, html: str): + soup = BeautifulSoup(html, "html.parser") + + fic.fetched = OilTimestamp.now() + fic.languageId = Language.getId("English") # All stories on mcstories are presumed english + fic.title = soup.find(class_="title").string.strip() + fic.description = soup.find(class_="synopsis").get_text().strip() + fic.ageRating = "M" # *EROTIC* Mind Control Story Archive + + date_strings = soup.find_all('h3', class_='dateline') + published_date_string = date_strings.pop(0).string[len("Added "):] # "Added 18 October 2014" + if date_strings: + updated_date_string = date_strings.pop(0).string[len("Updated "):] # "Updated 18 October 2014" + else: + updated_date_string = published_date_string + publishedUts = util.parseDateAsUnix(published_date_string, fic.fetched) + updatedUts = util.parseDateAsUnix(updated_date_string, fic.fetched) + fic.published = OilTimestamp(publishedUts) + + chapter_table = soup.find('table', class_='index') + if chapter_table: + chapters = self.get_chapter_meta(chapter_table, fic) + else: + chapter_div = soup.find('div', class_='chapter') + link = chapter_div.find('a') + relative_chapter_path = link['href'] + clid = relative_chapter_path_re.fullmatch(relative_chapter_path).group('clid') + + chapters = [ + { + 'clid': clid, + 'title': link.string, + 'chapter_link': self.absolute_chapter_link(fic, relative_chapter_path), + 'words': int(strip_nonnumeric(link.next_sibling)), + 'published': publishedUts, + 'updated': publishedUts + }] + + fic.chapterCount = len(chapters) + fic.wordCount = sum(chapter['words'] for chapter in chapters) + + fic.reviewCount = 0 + fic.favoriteCount = 0 + fic.followCount = 0 + + # The update date of the last chapter is the best estimate of this story's last update + all_updates = [updatedUts] + [chapter['updated'] for chapter in chapters] + fic.updated = OilTimestamp(max(all_updates)) + + fic.ficStatus = FicStatus.ongoing # TODO: No indication on this site. 
+ + byline = soup.find("h3", class_="byline") + authorLink = self.absolute_author_link(byline.find("a")['href']) + + authorUrl = authorLink + author = byline.find("a").string + authorId = author # map pseudo to real? + self.setAuthor(fic, author, authorUrl, authorId) + fic.upsert() + + for cid, chapter_meta in enumerate(chapters, 1): + chap = fic.chapter(cid) + chap.url = chapter_meta['chapter_link'] + chap.localChapterId = chapter_meta['clid'] + chap.title = chapter_meta['title'] + chap.upsert() + + return fic + + # extract the html text which contains the story itself + def extractContent(self, fic: Fic, html: str) -> str: + soup = BeautifulSoup(html, "html.parser") + article = soup.find('article') + if article is None: + edumpContent(html, "mcstories_ec") + raise Exception("unable to find chapters, e-dumped") + + return str(article) + + def getCurrentInfo(self, fic: Fic) -> Fic: + fic.url = self.constructUrl(fic) + data = scrape.scrape(fic.url) + return self.parseInfoInto(fic, data["raw"]) diff --git a/alexandria_api_requirements.txt b/alexandria_api_requirements.txt index 03bfa0c..352b40f 100644 --- a/alexandria_api_requirements.txt +++ b/alexandria_api_requirements.txt @@ -8,7 +8,7 @@ idna==2.10 itsdangerous==2.0.0 Jinja2==3.0.0 MarkupSafe==2.0.0 -psycopg2==2.9.9 +psycopg2-binary==2.9.9 python-dateutil==2.8.1 requests==2.31.0 six==1.16.0 diff --git a/htypes.py b/htypes.py index eb08afd..6897421 100644 --- a/htypes.py +++ b/htypes.py @@ -36,6 +36,7 @@ class FicType(IntEnum): fanficparadisesfw = 24 fanficparadisensfw = 25 wanderinginn = 26 + mcstories = 27 def adaptFicType(ftype: FicType) -> AsIs: diff --git a/sql/minerva/addSources.sql b/sql/minerva/addSources.sql index 7c06cc6..9b43f68 100644 --- a/sql/minerva/addSources.sql +++ b/sql/minerva/addSources.sql @@ -24,6 +24,7 @@ insert into source(url, name, description) values ('https://thefanfictionforum.net/xenforo/index.php', 'The Fanfiction Forum', 'The Fanfiction Forum'), 
('https://www.fanficparadise.com/fpforum-sfw/index.php', 'Fanfiction Paradise SFW', 'Fanfiction Paradise SFW'), ('https://www.fanficparadise.com/fpforum-nsfw/index.php', 'Fanfiction Paradise NSFW', 'Fanfiction Paradise NSFW'), - ('https://wanderinginn.com/', 'The Wandering Inn', 'A tale of a girl, an inn, and a world full of levels') + ('https://wanderinginn.com/', 'The Wandering Inn', 'A tale of a girl, an inn, and a world full of levels'), + ('https://mcstories.com/', 'The Erotic Mind-Control Story Archive', 'An archive of erotic mind control stories') on conflict do nothing; diff --git a/util.py b/util.py index 29f54f5..742216f 100644 --- a/util.py +++ b/util.py @@ -353,3 +353,13 @@ def decodeCloudFlareEmail(email: str) -> str: octets = [int(email[i : i + 2], 16) for i in range(0, len(email), 2)] key, ebytes = octets[0], octets[1:] return "".join([chr(o ^ key) for o in ebytes]) + + +def dicts_from_table(bs_table): + headers = [header.text for header in bs_table.find_all('th')] + results = [ + {headers[i]: cell for i, cell in enumerate(row.find_all('td'))} + for row in bs_table.find_all('tr') + ] + results = [r for r in results if r] + return results