Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1"""LICENSE 

2Copyright 2015 Hermann Krumrey <hermann@krumreyh.com> 

3 

4This file is part of manga-dl. 

5 

6manga-dl is free software: you can redistribute it and/or modify 

7it under the terms of the GNU General Public License as published by 

8the Free Software Foundation, either version 3 of the License, or 

9(at your option) any later version. 

10 

11manga-dl is distributed in the hope that it will be useful, 

12but WITHOUT ANY WARRANTY; without even the implied warranty of 

13MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

14GNU General Public License for more details. 

15 

16You should have received a copy of the GNU General Public License 

17along with manga-dl. If not, see <http://www.gnu.org/licenses/>. 

18LICENSE""" 

19 

20import logging 

21from typing import Optional, List, Set, Dict 

22from manga_dl.entities.Chapter import Chapter 

23 

24 

class Scraper:
    """
    Specifies the capabilities of a manga download site scraper.

    Concrete subclasses implement the site-specific hooks
    (name, url_matches, generate_url, _load_chapters); this base class
    provides the shared post-processing pipeline: language filtering,
    sorting, deduplication and multipart-chapter combination.
    """

    def __init__(
            self,
            _format: str = "cbz",
            destination: Optional[str] = None,
            languages: Optional[Set[str]] = None
    ):
        """
        Initializes the Scraper object
        :param _format: The format in which to store chapters
        :param destination: The destination directory in
                            which to store chapters
        :param languages: Set of languages for which to check
        """
        self.logger = logging.getLogger(self.__class__.__name__)
        self.format = _format
        self.destination = destination
        if languages is None:
            # Default to English releases; "gb"/"us" presumably cover
            # sites that report languages as country codes.
            self.languages = {"english", "gb", "us"}
        else:
            self.languages = languages

    @classmethod
    def name(cls) -> str:
        """
        :return: The name of the scraper
        """
        raise NotImplementedError()

    @classmethod
    def url_matches(cls, url: str) -> bool:
        """
        Checks whether or not an URL matches for the scraper
        :param url: The URL to check
        :return: Whether the URL is valid
        """
        raise NotImplementedError()

    def generate_url(self, _id: str) -> str:
        """
        Generates an URL based on an ID
        :param _id: The ID to use
        :return: The generated URL
        """
        raise NotImplementedError()

    def load_chapters(
            self,
            url: Optional[str] = None,
            _id: Optional[str] = None
    ) -> List[Chapter]:
        """
        Loads a list of Chapter objects for an URL or ID
        Only one of either an URL or an ID is required
        :param url: The URL
        :param _id: The ID
        :return: The list of chapters, post-processed (language-filtered,
                 sorted, deduplicated, multipart chapters combined);
                 empty list if neither URL nor ID was usable
        """
        if url is None and _id is None:
            self.logger.warning("Neither URL or ID provided. Can't continue.")
            return []
        elif url is not None and not self.url_matches(url):
            self.logger.warning("Invalid URL. Can't continue.")
            return []
        elif _id is not None:
            url = self.generate_url(_id)

        chapters = self._load_chapters(str(url))
        chapters = self._remove_other_languages(chapters)
        chapters = self._sort_chapters(chapters)
        chapters = self._deduplicate_chapters(chapters)
        chapters = self._combine_multipart_chapters(chapters)

        return chapters

    @staticmethod
    def _sort_chapters(chapters: List[Chapter]) -> List[Chapter]:
        """
        Sorts a list of chapters in place. First by their total chapter
        number, then by their macro chapter number.
        :param chapters: The chapters to sort
        :return: The sorted list of chapters (same list object)
        """
        # Both sort steps are necessary! Python's sort is stable, so the
        # second (macro-number) pass preserves the micro-chapter ordering
        # established by the first pass. zfill pads with zeroes so that
        # lexicographic string comparison matches numeric order.
        chapters.sort(
            key=lambda x: str(x.chapter_number).zfill(15)
        )
        chapters.sort(
            key=lambda x: str(x.chapter_number.split(".")[0]).zfill(15)
        )
        return chapters

    def _remove_other_languages(self, chapters: List[Chapter]) \
            -> List[Chapter]:
        """
        Removes unwanted languages from the chapter list
        :param chapters: The chapter list
        :return: The chapter list without unwanted language entries
        """
        return list(filter(lambda x: x.language in self.languages, chapters))

    def _combine_multipart_chapters(self, chapters: List[Chapter]) \
            -> List[Chapter]:
        """
        Combines multipart chapters with each other (e.g. 12.1 and 12.2)

        Walks the (sorted) chapter list and merges consecutive micro
        chapters of the same macro chapter into the first part, which
        becomes the "master" chapter carrying the additional URLs.
        :param chapters: The list of chapter to work through
        :return: The new chapter list
        """

        if len(chapters) < 2:
            return chapters

        last_chapter = chapters.pop(0)
        combined_chapters = []  # type: List[Chapter]
        to_combine = []  # type: List[Chapter]
        diff = 1  # expected micro-chapter gap to the next part

        for chapter in chapters:

            # A part numbered X.1 at a macro-chapter boundary is really
            # the start of chapter X, so rename it to the plain number.
            new_chapter = last_chapter.macro_chapter != chapter.macro_chapter
            if chapter.micro_chapter == 1 and new_chapter:
                self.logger.debug("Marking chapter {} as {}".format(
                    chapter.chapter_number, chapter.macro_chapter
                ))
                chapter.chapter_number = str(chapter.macro_chapter)

            if last_chapter.macro_chapter == chapter.macro_chapter:

                same_chapter = \
                    last_chapter.micro_chapter + diff == chapter.micro_chapter

                # Special case: parts counted 0-based may jump 0 -> 2.
                if last_chapter.micro_chapter == 0 \
                        and chapter.micro_chapter == 2:
                    same_chapter = True
                    diff = 2

                if same_chapter:
                    to_combine.append(chapter)
                    diff += 1
                    continue

            # Only merge when the master chapter is the first part
            # (micro chapter 0 or 1); otherwise keep the parts separate.
            if len(to_combine) > 0 and last_chapter.micro_chapter in [0, 1]:
                self._combine_chapters(last_chapter, to_combine)
                to_combine = []
                diff = 1

            combined_chapters.append(last_chapter)
            combined_chapters += to_combine
            to_combine = []
            last_chapter = chapter

        # Flush the trailing run after the loop ends.
        if len(to_combine) > 0 and last_chapter.micro_chapter in [0, 1]:
            self._combine_chapters(last_chapter, to_combine)
            to_combine = []

        combined_chapters.append(last_chapter)
        combined_chapters += to_combine

        return combined_chapters

    def _combine_chapters(self, chapter: Chapter, to_combine: List[Chapter]):
        """
        Adds chapters to a chapter
        :param chapter: The master chapter, renamed to its macro chapter
                        number and extended with the parts' URLs
        :param to_combine: The chapters to add
        :return: None
        """
        combined_numbers = [chapter.chapter_number]

        chapter.chapter_number = str(chapter.macro_chapter)
        for extra in to_combine:
            chapter.add_additional_url(extra.url)
            combined_numbers.append(extra.chapter_number)

        self.logger.debug("Combined chapters: {}".format(combined_numbers))

    def _deduplicate_chapters(self, chapters: List[Chapter]) -> List[Chapter]:
        """
        Removes duplicate chapters from a list
        The chapter to use is based on which scanlation group was most often
        found in the other chapters
        :param chapters: The chapters to work through
        :return: The deduplicated list of chapters
        """

        if len(chapters) < 2:
            return chapters

        groups = {}  # type: Dict[str, int]
        chapter_map = {}  # type: Dict[str, List[Chapter]]
        for chapter in chapters:
            # BUG FIX: the membership test previously checked the raw
            # chapter.group while writes keyed on str(chapter.group); for
            # non-string groups (e.g. None) the test never matched, so the
            # count was reset to 1 on every occurrence instead of
            # incremented. Use a single coerced key throughout.
            group_key = str(chapter.group)
            groups[group_key] = groups.get(group_key, 0) + 1

            chapter_map.setdefault(chapter.chapter_number, []).append(chapter)

        # For duplicated chapter numbers, keep only the entry from the
        # most frequently seen scanlation group.
        for chapter_number, elements in chapter_map.items():
            if len(elements) > 1:
                best = max(elements, key=lambda x: groups[str(x.group)])
                chapter_map[chapter_number] = [best]

        # Rebuild in original order, dropping everything but the winner.
        deduplicated = []
        for chapter in chapters:

            best_chapter = chapter_map[chapter.chapter_number][0]

            if best_chapter == chapter:
                deduplicated.append(chapter)
            else:
                self.logger.debug("Discarding duplicate chapter {}"
                                  .format(chapter))

        return deduplicated

    def _load_chapters(self, url: str) -> List[Chapter]:
        """
        Scraper-specific implementation that loads chapters from the website
        :param url: The URL to scrape
        :return: The list of chapters found while scraping
        """
        raise NotImplementedError()