@@ -70,20 +70,27 @@ export function htmlToMarkdown(
70
70
return turndownService . turndown ( processedHtml ) ;
71
71
}
72
72
73
- // Adapted from https://github.com/adbar/trafilatura/blob/9338dffe4478f91639952fa3b3105d6c0a461587/trafilatura/xpaths.py
74
- // Changes:
75
- // - Commented out tags
73
+ // Adapted from https://github.com/adbar/trafilatura/blob/c7e00f3a31e436c7b6ce666b44712e16e30908c0/trafilatura/xpaths.py#L100
74
+ // Added:
76
75
// - Add contains(@id, "filter") to remove filter menus
76
+ // Removed (because user might want to extract them):
77
+ // - Commented out tags
78
+ // - Commented out author
79
+ // - Commented out rating
80
+ // - Commented out attachment
81
+ // - Commented out timestamp
82
+ // - Commented out user-info and user-profile
83
+ // - Commented out comment or hidden section
77
84
const OVERALL_DISCARD_XPATH = [
78
85
// navigation + footers, news outlets related posts, sharing, jp-post-flair jp-relatedposts
79
86
`.//*[(self::div or self::item or self::list
80
87
or self::p or self::section or self::span)][
81
88
contains(translate(@id, "F","f"), "footer") or contains(translate(@class, "F","f"), "footer")
82
89
or contains(@id, "related") or contains(translate(@class, "R", "r"), "related") or
83
90
contains(@id, "viral") or contains(@class, "viral") or
91
+ contains(@id, "filter") or
84
92
starts-with(@id, "shar") or starts-with(@class, "shar") or
85
93
contains(@class, "share-") or
86
- contains(@id, "filter") or
87
94
contains(translate(@id, "S", "s"), "share") or
88
95
contains(@id, "social") or contains(@class, "social") or contains(@class, "sociable") or
89
96
contains(@id, "syndication") or contains(@class, "syndication") or
@@ -93,6 +100,7 @@ const OVERALL_DISCARD_XPATH = [
93
100
or contains(@class, "subnav") or
94
101
contains(@id, "cookie") or contains(@class, "cookie") or ` +
95
102
// `contains(@id, "tags") or contains(@class, "tags") or ` +
103
+ `contains(@id, "sidebar") or contains(@class, "sidebar") or ` +
96
104
`contains(@id, "banner") or contains(@class, "banner")
97
105
or contains(@class, "meta") or
98
106
contains(@id, "menu") or contains(@class, "menu") or
@@ -105,7 +113,8 @@ const OVERALL_DISCARD_XPATH = [
105
113
`contains(@id, "button") or contains(@class, "button")
106
114
or contains(translate(@class, "B", "b"), "byline") or ` +
107
115
// contains(@class, "rating") or ` +
108
- // contains(@class, "timestamp") or
116
+ `starts-with(@class, "widget") or ` +
117
+ // contains(@class, "attachment") or contains(@class, "timestamp") or
109
118
// contains(@class, "user-info") or contains(@class, "user-profile") or
110
119
`contains(@class, "-ad-") or contains(@class, "-icon")
111
120
or contains(@class, "article-infos") or
@@ -118,11 +127,13 @@ const OVERALL_DISCARD_XPATH = [
118
127
or contains(@id, "premium-") or contains(@id, "paywall")
119
128
or contains(@class, "obfuscated") or contains(@class, "blurred")
120
129
or contains(@class, " ad ")
121
- or contains(@class, "next-post")
130
+ or contains(@class, "next-post") or contains(@class, "side-stories")
131
+ or contains(@class, "related-stories") or contains(@class, "most-popular")
132
+ or contains(@class, "mol-factbox") or starts-with(@class, "ZendeskForm")
122
133
or contains(@class, "message-container") or contains(@id, "message_container")
123
134
or contains(@class, "yin") or contains(@class, "zlylin") or
124
135
contains(@class, "xg1") or contains(@id, "bmdh")
125
- or @data-lp-replacement-content]` ,
136
+ or @data-lp-replacement-content or @data-testid ]` ,
126
137
127
138
// comment debris + hidden parts
128
139
// `.//*[@class="comments-title" or contains(@class, "comments-title") or
@@ -135,6 +146,9 @@ const OVERALL_DISCARD_XPATH = [
135
146
// or contains(@class, "notloaded")]`,
136
147
] ;
137
148
149
+ // Adapted from https://github.com/adbar/trafilatura/blob/c7e00f3a31e436c7b6ce666b44712e16e30908c0/trafilatura/xpaths.py#L179
150
+ // Removed:
151
+ // - contains(@style, "border")
138
152
const PRECISION_DISCARD_XPATH = [
139
153
".//header" ,
140
154
`.//*[(self::div or self::item or self::list
@@ -144,6 +158,10 @@ const PRECISION_DISCARD_XPATH = [
144
158
]` ,
145
159
] ;
146
160
161
+ // Adapted from https://github.com/adbar/trafilatura/blob/c7e00f3a31e436c7b6ce666b44712e16e30908c0/trafilatura/settings.py#L55
162
+ // Removed (because user might want to extract them):
163
+ // - form
164
+ // - fieldset
147
165
const MANUALLY_CLEANED = [
148
166
// important
149
167
"aside" ,
@@ -159,7 +177,9 @@ const MANUALLY_CLEANED = [
159
177
"applet" ,
160
178
"audio" ,
161
179
"canvas" ,
180
+ "figure" ,
162
181
"map" ,
182
+ "picture" ,
163
183
"svg" ,
164
184
"video" ,
165
185
// secondary
@@ -190,13 +210,10 @@ const MANUALLY_CLEANED = [
190
210
"rt" ,
191
211
"rtc" ,
192
212
"select" ,
213
+ "source" ,
193
214
"style" ,
194
215
"track" ,
195
216
"textarea" ,
196
217
"time" ,
197
218
"use" ,
198
- // images
199
- "figure" ,
200
- "picture" ,
201
- "source" ,
202
219
] ;
0 commit comments