Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions adapter/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from adapter.fictionPressAdapter import FictionPressAdapter
from adapter.hpFanficArchiveAdapter import HpFanficArchiveAdapter
from adapter.hpffAdapter import HarryPotterFanfictionAdapter
from adapter.mcstoriesAdapter import McStoriesAdapter
from adapter.parahumansAdapter import ParahumansAdapter
from adapter.portkeyArchiveAdapter import PortkeyArchiveAdapter
from adapter.questionableQuestingAdapter import QuestionableQuestingAdapter
Expand Down Expand Up @@ -45,6 +46,7 @@ def registerAdapters() -> None:
adapters[FicType.sufficientvelocity] = SufficientVelocityAdapter()
adapters[FicType.questionablequesting] = QuestionableQuestingAdapter()
adapters[FicType.harrypotterfanfiction] = HarryPotterFanfictionAdapter()
adapters[FicType.mcstories] = McStoriesAdapter()
adapters[FicType.parahumans] = ParahumansAdapter()
adapters[FicType.adultfanfiction] = AdultFanfictionAdapter()
adapters[FicType.fanficsme] = FanficsMeAdapter()
Expand Down
182 changes: 182 additions & 0 deletions adapter/mcstoriesAdapter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
import re
import urllib.parse
from typing import Optional

from bs4 import BeautifulSoup

import scrape
import util
from adapter.adapter import Adapter, edumpContent
from htypes import FicType, FicId
from schema import OilTimestamp
from store import Fic, Language
from store import FicStatus

# URL path shapes on mcstories.com:
#   /<story id>/index.html          -> story landing/index page
#   /<story id>/<chapter file>.html -> an individual chapter page
story_index_path_re = re.compile(r"/(?P<lid>\w+)/index\.html")
story_chapter_path_re = re.compile(r"/(?P<lid>\w+)/(?P<clid>\w+)\.html")
relative_chapter_path_re = re.compile(r"(?P<clid>\w+)\.html")


def strip_nonnumeric(s: str) -> str:
    """Return only the digit characters of s, e.g. "1,234 words" -> "1234"."""
    return ''.join(c for c in s if c.isdigit())


class McStoriesAdapter(Adapter):
    """Site adapter for mcstories.com (The Erotic Mind-Control Story Archive)."""

    def __init__(self) -> None:
        # NOTE(review): positional super() args mirror the other adapters --
        # presumably (enabled flag, base url, url fragment, fic type,
        # dump/log tag); confirm against Adapter.__init__.
        super().__init__(
            True,
            "https://mcstories.com/",
            "mcstories.com",
            FicType.mcstories,
            "mcstories",
        )

def tryParseUrl(self, url: str) -> Optional[FicId]:
    """Map an mcstories.com URL to a FicId, or None if unrecognized.

    Both the story index page (/{lid}/index.html) and an individual
    chapter page (/{lid}/{chapter}.html) resolve to the story's local id.
    """
    path = urllib.parse.urlparse(url).path

    if m := story_index_path_re.fullmatch(path):
        lid = m.group("lid")
    elif m := story_chapter_path_re.fullmatch(path):
        # Use the story id (first path segment), not the chapter file name:
        # multi-chapter stories name chapter files {lid}{cid}.html, so the
        # clid group is only a valid story id for single-chapter stories.
        lid = m.group("lid")
    else:
        return None
    return FicId(self.ftype, lid)

def constructUrl(self, fic, chapter=None):
    """Build the absolute URL for a fic's index page, or for one chapter."""
    if chapter is not None:
        tail = f"/{fic.localId}/{chapter.localId}.html"
    else:
        tail = f"/{fic.localId}/index.html"
    return urllib.parse.urljoin(self.baseUrl, tail)

def create(self, fic: Fic) -> Fic:
    """Fetch the story index page, parse its metadata, and persist the fic."""
    fic.url = self.constructUrl(fic)

    page = scrape.scrape(fic.url)
    edumpContent(page["raw"], "mcstories")

    parsed = self.parseInfoInto(fic, page["raw"])
    parsed.upsert()
    return parsed

def absolute_author_link(self, rel_author_link):
    """Resolve a (possibly '..'-relative) author href against the site root."""
    # Strip a leading ".." so the remainder joins onto baseUrl cleanly.
    path = rel_author_link[2:] if rel_author_link.startswith('..') else rel_author_link
    return urllib.parse.urljoin(self.baseUrl, path)

def absolute_chapter_link(self, fic, rel_chapter_link):
    """Resolve a chapter href (relative to the story directory) to an absolute URL."""
    return urllib.parse.urljoin(self.baseUrl, f"{fic.localId}/{rel_chapter_link}")

def get_chapter_meta(self, table, fic):
    """Extract per-chapter metadata from the story index <table>.

    Returns a list of dicts with keys: clid, title, chapter_link, words,
    published, updated.  Expects the table's header row to include
    'Chapter', 'Length', and 'Added' columns ('Updated' is optional).
    """
    chapters = []
    for chapter_row_dict in util.dicts_from_table(table):
        name_cell = chapter_row_dict['Chapter']
        words_cell = chapter_row_dict['Length']
        date_cell = chapter_row_dict['Added']

        # Rows without an 'Updated' column fall back to the 'Added' cell.
        updated_cell = chapter_row_dict.get('Updated', date_cell)
        relative_chapter_path = name_cell.find('a')['href']
        clid = relative_chapter_path_re.fullmatch(relative_chapter_path).group('clid')
        published = util.parseDateAsUnix(date_cell.string, fic.fetched)

        # An 'Updated' cell may exist but be blank; treat blank as "never
        # updated" and reuse the published date.
        if updated_cell.string and updated_cell.string.strip():
            updated = util.parseDateAsUnix(updated_cell.string, fic.fetched)
        else:
            updated = published
        chapters.append({
            'clid': clid,
            'title': name_cell.string,
            'chapter_link': self.absolute_chapter_link(fic, relative_chapter_path),
            # e.g. "1,234 words" -> 1234
            'words': int(strip_nonnumeric(words_cell.string)),
            'published': published,
            'updated': updated,
        })
    return chapters

def parseInfoInto(self, fic: Fic, html: str):
    """Populate fic (and its chapter rows) from the story index page html.

    Upserts the fic and each chapter; returns the updated fic.
    """
    soup = BeautifulSoup(html, "html.parser")

    fic.fetched = OilTimestamp.now()
    fic.languageId = Language.getId("English")  # All stories on mcstories are presumed english
    fic.title = soup.find(class_="title").string.strip()
    fic.description = soup.find(class_="synopsis").get_text().strip()
    fic.ageRating = "M"  # *EROTIC* Mind Control Story Archive

    # Dateline headers hold "Added <date>" and, optionally, "Updated <date>".
    date_strings = soup.find_all('h3', class_='dateline')
    published_date_string = date_strings.pop(0).string[len("Added "):]  # "Added 18 October 2014"
    if date_strings:
        updated_date_string = date_strings.pop(0).string[len("Updated "):]  # "Updated 18 October 2014"
    else:
        updated_date_string = published_date_string
    publishedUts = util.parseDateAsUnix(published_date_string, fic.fetched)
    updatedUts = util.parseDateAsUnix(updated_date_string, fic.fetched)
    fic.published = OilTimestamp(publishedUts)

    # Multi-chapter stories list chapters in a table.index; single-chapter
    # stories link the lone chapter from a div.chapter instead.
    chapter_table = soup.find('table', class_='index')
    if chapter_table:
        chapters = self.get_chapter_meta(chapter_table, fic)
    else:
        chapter_div = soup.find('div', class_='chapter')
        link = chapter_div.find('a')
        relative_chapter_path = link['href']
        clid = relative_chapter_path_re.fullmatch(relative_chapter_path).group('clid')

        # NOTE(review): assumes the text node immediately after the link
        # carries the word count (e.g. " (1,234 words)") -- confirm against
        # a live single-chapter page.
        chapters = [
            {
                'clid': clid,
                'title': link.string,
                'chapter_link': self.absolute_chapter_link(fic, relative_chapter_path),
                'words': int(strip_nonnumeric(link.next_sibling)),
                'published': publishedUts,
                'updated': publishedUts
            }]

    fic.chapterCount = len(chapters)
    fic.wordCount = sum(chapter['words'] for chapter in chapters)

    # mcstories exposes no review/favorite/follow counts.
    fic.reviewCount = 0
    fic.favoriteCount = 0
    fic.followCount = 0

    # The update date of the last chapter is the best estimate of this story's last update
    all_updates = [updatedUts] + [chapter['updated'] for chapter in chapters]
    fic.updated = OilTimestamp(max(all_updates))

    fic.ficStatus = FicStatus.ongoing  # TODO: No indication on this site.

    byline = soup.find("h3", class_="byline")
    authorLink = self.absolute_author_link(byline.find("a")['href'])

    authorUrl = authorLink
    author = byline.find("a").string
    authorId = author  # map pseudo to real?
    self.setAuthor(fic, author, authorUrl, authorId)
    fic.upsert()

    # Persist chapter rows; cid is the 1-based sequential chapter number.
    for cid, chapter_meta in enumerate(chapters, 1):
        chap = fic.chapter(cid)
        chap.url = chapter_meta['chapter_link']
        chap.localChapterId = chapter_meta['clid']
        chap.title = chapter_meta['title']
        chap.upsert()

    return fic

def extractContent(self, fic: Fic, html: str) -> str:
    """Extract the html text which contains the story itself (the <article>)."""
    article = BeautifulSoup(html, "html.parser").find('article')
    if article is None:
        # Dump the page for debugging before giving up.
        edumpContent(html, "mcstories_ec")
        raise Exception("unable to find chapters, e-dumped")
    return str(article)

def getCurrentInfo(self, fic: Fic) -> Fic:
    """Re-scrape the story index page and refresh fic metadata in place."""
    fic.url = self.constructUrl(fic)
    page = scrape.scrape(fic.url)
    return self.parseInfoInto(fic, page["raw"])
2 changes: 1 addition & 1 deletion alexandria_api_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ idna==2.10
itsdangerous==2.0.0
Jinja2==3.0.0
MarkupSafe==2.0.0
psycopg2==2.9.9
psycopg2-binary==2.9.9
python-dateutil==2.8.1
requests==2.31.0
six==1.16.0
Expand Down
1 change: 1 addition & 0 deletions htypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ class FicType(IntEnum):
fanficparadisesfw = 24
fanficparadisensfw = 25
wanderinginn = 26
mcstories = 27


def adaptFicType(ftype: FicType) -> AsIs:
Expand Down
3 changes: 2 additions & 1 deletion sql/minerva/addSources.sql
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ insert into source(url, name, description) values
('https://thefanfictionforum.net/xenforo/index.php', 'The Fanfiction Forum', 'The Fanfiction Forum'),
('https://www.fanficparadise.com/fpforum-sfw/index.php', 'Fanfiction Paradise SFW', 'Fanfiction Paradise SFW'),
('https://www.fanficparadise.com/fpforum-nsfw/index.php', 'Fanfiction Paradise NSFW', 'Fanfiction Paradise NSFW'),
('https://wanderinginn.com/', 'The Wandering Inn', 'A tale of a girl, an inn, and a world full of levels')
('https://wanderinginn.com/', 'The Wandering Inn', 'A tale of a girl, an inn, and a world full of levels'),
('https://mcstories.com/', 'The Erotic Mind-Control Story Archive', 'An archive of erotic mind control stories')
on conflict do nothing;

10 changes: 10 additions & 0 deletions util.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,3 +353,13 @@ def decodeCloudFlareEmail(email: str) -> str:
octets = [int(email[i : i + 2], 16) for i in range(0, len(email), 2)]
key, ebytes = octets[0], octets[1:]
return "".join([chr(o ^ key) for o in ebytes])


def dicts_from_table(bs_table):
    """Convert a BeautifulSoup <table> into a list of {header: <td tag>} dicts.

    Headers come from the table's <th> cells.  Each <tr> that contains
    <td> cells becomes one dict mapping header text to the corresponding
    <td> element (the tag itself, not its text).  Rows with no <td>
    cells -- e.g. the header row -- are dropped.
    """
    headers = [header.text for header in bs_table.find_all('th')]
    results = []
    for row in bs_table.find_all('tr'):
        # zip truncates at the shorter sequence, so a malformed row with
        # more cells than headers cannot raise IndexError.
        row_dict = dict(zip(headers, row.find_all('td')))
        if row_dict:
            results.append(row_dict)
    return results