Skip to content

Commit b0c59e8

Browse files
committed
added extract version
1 parent 96b10e0 commit b0c59e8

File tree

1 file changed

+28
-11
lines changed

1 file changed

+28
-11
lines changed

src/converters.ts

+28-11
Original file line numberDiff line numberDiff line change
@@ -70,20 +70,27 @@ export function htmlToMarkdown(
7070
return turndownService.turndown(processedHtml);
7171
}
7272

73-
// Adapted from https://github.com/adbar/trafilatura/blob/9338dffe4478f91639952fa3b3105d6c0a461587/trafilatura/xpaths.py
74-
// Changes:
75-
// - Commented out tags
73+
// Adapted from https://github.com/adbar/trafilatura/blob/c7e00f3a31e436c7b6ce666b44712e16e30908c0/trafilatura/xpaths.py#L100
74+
// Added:
7675
// - Add contains(@id, "filter") to remove filter menus
76+
// Removed (because the user might want to extract them):
77+
// - Commented out tags
78+
// - Commented out author
79+
// - Commented out rating
80+
// - Commented out attachment
81+
// - Commented out timestamp
82+
// - Commented out user-info and user-profile
83+
// - Commented out comment or hidden section
7784
const OVERALL_DISCARD_XPATH = [
7885
// navigation + footers, news outlets related posts, sharing, jp-post-flair jp-relatedposts
7986
`.//*[(self::div or self::item or self::list
8087
or self::p or self::section or self::span)][
8188
contains(translate(@id, "F","f"), "footer") or contains(translate(@class, "F","f"), "footer")
8289
or contains(@id, "related") or contains(translate(@class, "R", "r"), "related") or
8390
contains(@id, "viral") or contains(@class, "viral") or
91+
contains(@id, "filter") or
8492
starts-with(@id, "shar") or starts-with(@class, "shar") or
8593
contains(@class, "share-") or
86-
contains(@id, "filter") or
8794
contains(translate(@id, "S", "s"), "share") or
8895
contains(@id, "social") or contains(@class, "social") or contains(@class, "sociable") or
8996
contains(@id, "syndication") or contains(@class, "syndication") or
@@ -93,6 +100,7 @@ const OVERALL_DISCARD_XPATH = [
93100
or contains(@class, "subnav") or
94101
contains(@id, "cookie") or contains(@class, "cookie") or ` +
95102
// `contains(@id, "tags") or contains(@class, "tags") or ` +
103+
`contains(@id, "sidebar") or contains(@class, "sidebar") or ` +
96104
`contains(@id, "banner") or contains(@class, "banner")
97105
or contains(@class, "meta") or
98106
contains(@id, "menu") or contains(@class, "menu") or
@@ -105,7 +113,8 @@ const OVERALL_DISCARD_XPATH = [
105113
`contains(@id, "button") or contains(@class, "button")
106114
or contains(translate(@class, "B", "b"), "byline") or ` +
107115
// contains(@class, "rating") or ` +
108-
// contains(@class, "timestamp") or
116+
`starts-with(@class, "widget") or ` +
117+
// contains(@class, "attachment") or contains(@class, "timestamp") or
109118
// contains(@class, "user-info") or contains(@class, "user-profile") or
110119
`contains(@class, "-ad-") or contains(@class, "-icon")
111120
or contains(@class, "article-infos") or
@@ -118,11 +127,13 @@ const OVERALL_DISCARD_XPATH = [
118127
or contains(@id, "premium-") or contains(@id, "paywall")
119128
or contains(@class, "obfuscated") or contains(@class, "blurred")
120129
or contains(@class, " ad ")
121-
or contains(@class, "next-post")
130+
or contains(@class, "next-post") or contains(@class, "side-stories")
131+
or contains(@class, "related-stories") or contains(@class, "most-popular")
132+
or contains(@class, "mol-factbox") or starts-with(@class, "ZendeskForm")
122133
or contains(@class, "message-container") or contains(@id, "message_container")
123134
or contains(@class, "yin") or contains(@class, "zlylin") or
124135
contains(@class, "xg1") or contains(@id, "bmdh")
125-
or @data-lp-replacement-content]`,
136+
or @data-lp-replacement-content or @data-testid]`,
126137

127138
// comment debris + hidden parts
128139
// `.//*[@class="comments-title" or contains(@class, "comments-title") or
@@ -135,6 +146,9 @@ const OVERALL_DISCARD_XPATH = [
135146
// or contains(@class, "notloaded")]`,
136147
];
137148

149+
// Adapted from https://github.com/adbar/trafilatura/blob/c7e00f3a31e436c7b6ce666b44712e16e30908c0/trafilatura/xpaths.py#L179
150+
// Removed:
151+
// - contains(@style, "border")
138152
const PRECISION_DISCARD_XPATH = [
139153
".//header",
140154
`.//*[(self::div or self::item or self::list
@@ -144,6 +158,10 @@ const PRECISION_DISCARD_XPATH = [
144158
]`,
145159
];
146160

161+
// Adapted from https://github.com/adbar/trafilatura/blob/c7e00f3a31e436c7b6ce666b44712e16e30908c0/trafilatura/settings.py#L55
162+
// Removed (because the user might want to extract them):
163+
// - form
164+
// - fieldset
147165
const MANUALLY_CLEANED = [
148166
// important
149167
"aside",
@@ -159,7 +177,9 @@ const MANUALLY_CLEANED = [
159177
"applet",
160178
"audio",
161179
"canvas",
180+
"figure",
162181
"map",
182+
"picture",
163183
"svg",
164184
"video",
165185
// secondary
@@ -190,13 +210,10 @@ const MANUALLY_CLEANED = [
190210
"rt",
191211
"rtc",
192212
"select",
213+
"source",
193214
"style",
194215
"track",
195216
"textarea",
196217
"time",
197218
"use",
198-
// images
199-
"figure",
200-
"picture",
201-
"source",
202219
];

0 commit comments

Comments
 (0)