Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1"""LICENSE 

2Copyright 2015 Hermann Krumrey <hermann@krumreyh.com> 

3 

4This file is part of manga-dl. 

5 

6manga-dl is free software: you can redistribute it and/or modify 

7it under the terms of the GNU General Public License as published by 

8the Free Software Foundation, either version 3 of the License, or 

9(at your option) any later version. 

10 

11manga-dl is distributed in the hope that it will be useful, 

12but WITHOUT ANY WARRANTY; without even the implied warranty of 

13MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

14GNU General Public License for more details. 

15 

16You should have received a copy of the GNU General Public License 

17along with manga-dl. If not, see <http://www.gnu.org/licenses/>. 

18LICENSE""" 

19 

20import logging 

21from typing import Optional, List, Set, Dict 

22from manga_dl.entities.Chapter import Chapter 

23 

24 

class Scraper:
    """
    Specifies the capabilities of a manga download site scraper.

    Concrete subclasses implement the site-specific hooks
    (name, url_matches, generate_url, _load_chapters); this base class
    provides the shared post-processing pipeline: language filtering,
    sorting, deduplication and multipart-chapter combination.
    """

    def __init__(
            self,
            _format: str = "cbz",
            destination: Optional[str] = None,
            languages: Optional[Set[str]] = None
    ):
        """
        Initializes the Scraper object
        :param _format: The format in which to store chapters
        :param destination: The destination directory in
                            which to store chapters
        :param languages: Set of languages for which to check
        """
        self.logger = logging.getLogger(self.__class__.__name__)
        self.format = _format
        self.destination = destination
        if languages is None:
            # Default to English releases; "gb"/"us" presumably cover
            # sites that report languages as country codes.
            self.languages = {"english", "gb", "us"}
        else:
            self.languages = languages

    @classmethod
    def name(cls) -> str:
        """
        :return: The name of the scraper
        """
        raise NotImplementedError()

    @classmethod
    def url_matches(cls, url: str) -> bool:
        """
        Checks whether or not an URL matches for the scraper
        :param url: The URL to check
        :return: Whether the URL is valid
        """
        raise NotImplementedError()

    def generate_url(self, _id: str) -> str:
        """
        Generates an URL based on an ID
        :param _id: The ID to use
        :return: The generated URL
        """
        raise NotImplementedError()

    def load_chapters(
            self,
            url: Optional[str] = None,
            _id: Optional[str] = None
    ) -> List[Chapter]:
        """
        Loads a list of Chapter objects for an URL or ID
        Only one of either an URL or an ID is required
        :param url: The URL
        :param _id: The ID
        :return: The list of chapters, post-processed (language-filtered,
                 sorted, deduplicated, multipart chapters combined);
                 empty list if neither URL nor ID was usable
        """
        if url is None and _id is None:
            self.logger.warning("Neither URL or ID provided. Can't continue.")
            return []
        elif url is not None and not self.url_matches(url):
            self.logger.warning("Invalid URL. Can't continue.")
            return []
        elif _id is not None:
            url = self.generate_url(_id)

        chapters = self._load_chapters(str(url))
        chapters = self._remove_other_languages(chapters)
        chapters = self._sort_chapters(chapters)
        chapters = self._deduplicate_chapters(chapters)
        chapters = self._combine_multipart_chapters(chapters)

        return chapters

    @staticmethod
    def _sort_chapters(chapters: List[Chapter]) -> List[Chapter]:
        """
        Sorts a list of chapters in place. First by their total chapter
        number, then by their macro chapter number.
        :param chapters: The chapters to sort
        :return: The sorted list of chapters (same list object)
        """
        # Both sort steps are necessary! Python's sort is stable, so the
        # second (macro-number) pass preserves the micro-chapter ordering
        # established by the first pass. zfill pads with zeroes so that
        # lexicographic string comparison matches numeric order.
        chapters.sort(
            key=lambda x: str(x.chapter_number).zfill(15)
        )
        chapters.sort(
            key=lambda x: str(x.chapter_number.split(".")[0]).zfill(15)
        )
        return chapters

    def _remove_other_languages(self, chapters: List[Chapter]) \
            -> List[Chapter]:
        """
        Removes unwanted languages from the chapter list
        :param chapters: The chapter list
        :return: The chapter list without unwanted language entries
        """
        return list(filter(lambda x: x.language in self.languages, chapters))

    def _combine_multipart_chapters(self, chapters: List[Chapter]) \
            -> List[Chapter]:
        """
        Combines multipart chapters with each other (e.g. 12.1 and 12.2)

        Walks the (sorted) chapter list and merges consecutive micro
        chapters of the same macro chapter into the first part, which
        becomes the "master" chapter carrying the additional URLs.
        :param chapters: The list of chapter to work through
        :return: The new chapter list
        """

        if len(chapters) < 2:
            return chapters

        last_chapter = chapters.pop(0)
        combined_chapters = []  # type: List[Chapter]
        to_combine = []  # type: List[Chapter]
        diff = 1  # expected micro-chapter gap to the next part

        for chapter in chapters:

            # A part numbered X.1 at a macro-chapter boundary is really
            # the start of chapter X, so rename it to the plain number.
            new_chapter = last_chapter.macro_chapter != chapter.macro_chapter
            if chapter.micro_chapter == 1 and new_chapter:
                self.logger.debug("Marking chapter {} as {}".format(
                    chapter.chapter_number, chapter.macro_chapter
                ))
                chapter.chapter_number = str(chapter.macro_chapter)

            if last_chapter.macro_chapter == chapter.macro_chapter:

                same_chapter = \
                    last_chapter.micro_chapter + diff == chapter.micro_chapter

                # Special case: parts counted 0-based may jump 0 -> 2.
                if last_chapter.micro_chapter == 0 \
                        and chapter.micro_chapter == 2:
                    same_chapter = True
                    diff = 2

                if same_chapter:
                    to_combine.append(chapter)
                    diff += 1
                    continue

            # Only merge when the master chapter is the first part
            # (micro chapter 0 or 1); otherwise keep the parts separate.
            if len(to_combine) > 0 and last_chapter.micro_chapter in [0, 1]:
                self._combine_chapters(last_chapter, to_combine)
                to_combine = []
                diff = 1

            combined_chapters.append(last_chapter)
            combined_chapters += to_combine
            to_combine = []
            last_chapter = chapter

        # Flush the trailing run after the loop ends.
        if len(to_combine) > 0 and last_chapter.micro_chapter in [0, 1]:
            self._combine_chapters(last_chapter, to_combine)
            to_combine = []

        combined_chapters.append(last_chapter)
        combined_chapters += to_combine

        return combined_chapters

    def _combine_chapters(self, chapter: Chapter, to_combine: List[Chapter]):
        """
        Adds chapters to a chapter
        :param chapter: The master chapter, renamed to its macro chapter
                        number and extended with the parts' URLs
        :param to_combine: The chapters to add
        :return: None
        """
        combined_numbers = [chapter.chapter_number]

        chapter.chapter_number = str(chapter.macro_chapter)
        for extra in to_combine:
            chapter.add_additional_url(extra.url)
            combined_numbers.append(extra.chapter_number)

        self.logger.debug("Combined chapters: {}".format(combined_numbers))

    def _deduplicate_chapters(self, chapters: List[Chapter]) -> List[Chapter]:
        """
        Removes duplicate chapters from a list
        The chapter to use is based on which scanlation group was most often
        found in the other chapters
        :param chapters: The chapters to work through
        :return: The deduplicated list of chapters
        """

        if len(chapters) < 2:
            return chapters

        groups = {}  # type: Dict[str, int]
        chapter_map = {}  # type: Dict[str, List[Chapter]]
        for chapter in chapters:
            # BUG FIX: the membership test previously checked the raw
            # chapter.group while writes keyed on str(chapter.group); for
            # non-string groups (e.g. None) the test never matched, so the
            # count was reset to 1 on every occurrence instead of
            # incremented. Use a single coerced key throughout.
            group_key = str(chapter.group)
            groups[group_key] = groups.get(group_key, 0) + 1

            chapter_map.setdefault(chapter.chapter_number, []).append(chapter)

        # For duplicated chapter numbers, keep only the entry from the
        # most frequently seen scanlation group.
        for chapter_number, elements in chapter_map.items():
            if len(elements) > 1:
                best = max(elements, key=lambda x: groups[str(x.group)])
                chapter_map[chapter_number] = [best]

        # Rebuild in original order, dropping everything but the winner.
        deduplicated = []
        for chapter in chapters:

            best_chapter = chapter_map[chapter.chapter_number][0]

            if best_chapter == chapter:
                deduplicated.append(chapter)
            else:
                self.logger.debug("Discarding duplicate chapter {}"
                                  .format(chapter))

        return deduplicated

    def _load_chapters(self, url: str) -> List[Chapter]:
        """
        Scraper-specific implementation that loads chapters from the website
        :param url: The URL to scrape
        :return: The list of chapters found while scraping
        """
        raise NotImplementedError()