# (Scraping artifact: hot-key help from the HTML code-coverage viewer,
#  commented out so the module remains valid Python.)
# r m x p toggle line displays
# j k next/prev highlighted chunk
# 0 (zero) top of page
# 1 (one) first highlighted chunk
"""LICENSE
Copyright 2015 Hermann Krumrey <hermann@krumreyh.com>

This file is part of manga-dl.

manga-dl is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

manga-dl is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with manga-dl. If not, see <http://www.gnu.org/licenses/>.
LICENSE"""
20import logging
21from typing import Optional, List, Set, Dict
22from manga_dl.entities.Chapter import Chapter
class Scraper:
    """
    Specifies the capabilities of a manga download site scraper.

    Subclasses supply the site-specific pieces (name, url_matches,
    generate_url, _load_chapters); this base class provides the shared
    post-processing pipeline: language filtering, sorting, deduplication
    and combining of multipart chapters.
    """

    def __init__(
            self,
            _format: str = "cbz",
            destination: Optional[str] = None,
            languages: Optional[Set[str]] = None
    ):
        """
        Initializes the Scraper object
        :param _format: The format in which to store chapters
        :param destination: The destination directory in
                            which to store chapters
        :param languages: Set of languages for which to check
        """
        self.logger = logging.getLogger(self.__class__.__name__)
        self.format = _format
        self.destination = destination
        if languages is None:
            # Default filter: English chapters and common English flag codes
            self.languages = {"english", "gb", "us"}
        else:
            self.languages = languages

    @classmethod
    def name(cls) -> str:
        """
        :return: The name of the scraper
        :raises NotImplementedError: Must be provided by a subclass
        """
        raise NotImplementedError()

    @classmethod
    def url_matches(cls, url: str) -> bool:
        """
        Checks whether or not an URL matches for the scraper
        :param url: The URL to check
        :return: Whether the URL is valid
        :raises NotImplementedError: Must be provided by a subclass
        """
        raise NotImplementedError()

    def generate_url(self, _id: str) -> str:
        """
        Generates an URL based on an ID
        :param _id: The ID to use
        :return: The generated URL
        :raises NotImplementedError: Must be provided by a subclass
        """
        raise NotImplementedError()

    def load_chapters(
            self,
            url: Optional[str] = None,
            _id: Optional[str] = None
    ) -> List[Chapter]:
        """
        Loads a list of Chapter objects for an URL or ID
        Only one of either an URL or an ID is required
        :param url: The URL
        :param _id: The ID
        :return: The list of chapters (empty if no URL/ID was usable)
        """
        if url is None and _id is None:
            self.logger.warning("Neither URL or ID provided. Can't continue.")
            return []
        elif url is not None and not self.url_matches(url):
            self.logger.warning("Invalid URL. Can't continue.")
            return []
        elif _id is not None:
            # An explicit ID takes precedence; the URL is (re)generated
            url = self.generate_url(_id)

        chapters = self._load_chapters(str(url))
        chapters = self._remove_other_languages(chapters)
        chapters = self._sort_chapters(chapters)
        chapters = self._deduplicate_chapters(chapters)
        chapters = self._combine_multipart_chapters(chapters)

        return chapters

    @staticmethod
    def _sort_chapters(chapters: List[Chapter]) -> List[Chapter]:
        """
        Sorts a list of chapters. First by their total chapter number,
        then their macro chapter number
        :param chapters: The chapters to sort (sorted in place)
        :return: The sorted list of chapters
        """
        # Both sort steps are necessary!
        # list.sort is stable, so the second (macro) pass preserves the
        # relative order established by the first (full number) pass.
        chapters.sort(
            key=lambda x: str(x.chapter_number).zfill(15)
        )
        chapters.sort(
            key=lambda x: str(x.chapter_number.split(".")[0]).zfill(15)
        )
        return chapters

    def _remove_other_languages(self, chapters: List[Chapter]) \
            -> List[Chapter]:
        """
        Removes unwanted languages from the chapter list
        :param chapters: The chapter list
        :return: The chapter list without unwanted language entries
        """
        return list(filter(lambda x: x.language in self.languages, chapters))

    def _combine_multipart_chapters(self, chapters: List[Chapter]) \
            -> List[Chapter]:
        """
        Combines multipart chapters with each other (e.g. 12.1 and 12.2)
        Consecutive micro-chapters of the same macro chapter are merged
        into the first part; non-consecutive parts are kept as-is.
        :param chapters: The list of chapter to work through
        :return: The new chapter list
        """
        if len(chapters) < 2:
            return chapters

        last_chapter = chapters.pop(0)
        combined_chapters = []  # type: List[Chapter]
        to_combine = []  # type: List[Chapter]
        # Expected micro-chapter distance to the next part of a run
        diff = 1

        for chapter in chapters:

            new_chapter = last_chapter.macro_chapter != chapter.macro_chapter
            if chapter.micro_chapter == 1 and new_chapter:
                # An X.1 chapter that starts a new macro chapter is really
                # just chapter X
                self.logger.debug("Marking chapter {} as {}".format(
                    chapter.chapter_number, chapter.macro_chapter
                ))
                chapter.chapter_number = str(chapter.macro_chapter)

            if last_chapter.macro_chapter == chapter.macro_chapter:

                same_chapter = \
                    last_chapter.micro_chapter + diff == chapter.micro_chapter

                # Special case: a run starting X.0, X.2 still counts as one
                # chapter; subsequent parts are then expected 2 apart
                if last_chapter.micro_chapter == 0 \
                        and chapter.micro_chapter == 2:
                    same_chapter = True
                    diff = 2

                if same_chapter:
                    to_combine.append(chapter)
                    diff += 1
                    continue

            # Flush: merge the collected parts into the run's first part,
            # but only if that first part is an X.0 or X.1 chapter
            if len(to_combine) > 0 and last_chapter.micro_chapter in [0, 1]:
                self._combine_chapters(last_chapter, to_combine)
                to_combine = []
            diff = 1

            combined_chapters.append(last_chapter)
            combined_chapters += to_combine
            to_combine = []
            last_chapter = chapter

        # Flush the final pending run after the loop
        if len(to_combine) > 0 and last_chapter.micro_chapter in [0, 1]:
            self._combine_chapters(last_chapter, to_combine)
            to_combine = []

        combined_chapters.append(last_chapter)
        combined_chapters += to_combine

        return combined_chapters

    def _combine_chapters(self, chapter: Chapter, to_combine: List[Chapter]):
        """
        Adds chapters to a chapter
        The master chapter takes over the macro chapter number and absorbs
        the URLs of the combined parts.
        :param chapter: The master chapter
        :param to_combine: The chapters to add
        :return: None
        """
        combined_numbers = [chapter.chapter_number]

        chapter.chapter_number = str(chapter.macro_chapter)
        for extra in to_combine:
            chapter.add_additional_url(extra.url)
            combined_numbers.append(extra.chapter_number)

        self.logger.debug("Combined chapters: {}".format(combined_numbers))

    def _deduplicate_chapters(self, chapters: List[Chapter]) -> List[Chapter]:
        """
        Removes duplicate chapters from a list
        The chapter to use is based on which scanlation group was most often
        found in the other chapters
        :param chapters: The chapters to work through
        :return: The deduplicated list of chapters
        """
        if len(chapters) < 2:
            return chapters

        groups = {}  # type: Dict[str, int]
        chapter_map = {}  # type: Dict[str, List[Chapter]]
        for chapter in chapters:
            # BUGFIX: count under the stringified group key. Previously the
            # raw group object was tested against the dict while keys were
            # stored stringified, so non-string groups (e.g. None) were
            # reset to 1 on every occurrence instead of accumulating.
            group_key = str(chapter.group)
            groups[group_key] = groups.get(group_key, 0) + 1

            chapter_map.setdefault(chapter.chapter_number, []).append(chapter)

        for chapter_number, elements in chapter_map.items():
            if len(elements) > 1:
                # Prefer the release by the most prolific scanlation group
                best = max(elements, key=lambda x: groups[str(x.group)])
                chapter_map[chapter_number] = [best]

        deduplicated = []
        for chapter in chapters:

            best_chapter = chapter_map[chapter.chapter_number][0]

            if best_chapter == chapter:
                deduplicated.append(chapter)
            else:
                self.logger.debug("Discarding duplicate chapter {}"
                                  .format(chapter))

        return deduplicated

    def _load_chapters(self, url: str) -> List[Chapter]:
        """
        Scraper-specific implementation that loads chapters from the website
        :param url: The URL to scrape
        :return: The list of chapters found while scraping
        :raises NotImplementedError: Must be provided by a subclass
        """
        raise NotImplementedError()