Browse Source

Fix URL linkifier grabbing full-width spaces and quotations (#9997)

Fix #9993
Fix #5654
Eugen Rochko 1 week ago
parent
commit
016ad37bc8
No account linked to committer's email address
3 changed files with 55 additions and 5 deletions
  1. 11
    1
      app/lib/formatter.rb
  2. 2
    2
      config/initializers/twitter_regex.rb
  3. 42
    2
      spec/lib/formatter_spec.rb

+ 11
- 1
app/lib/formatter.rb View File

@@ -199,12 +199,22 @@ class Formatter
199 199
     result.flatten.join
200 200
   end
201 201
 
202
+  UNICODE_ESCAPE_BLACKLIST_RE = /\p{Z}|\p{P}/
203
+
202 204
   def utf8_friendly_extractor(text, options = {})
203 205
     old_to_new_index = [0]
204 206
 
205 207
     escaped = text.chars.map do |c|
206
-      output = c.ord.to_s(16).length > 2 ? CGI.escape(c) : c
208
+      output = begin
209
+        if c.ord.to_s(16).length > 2 && UNICODE_ESCAPE_BLACKLIST_RE.match(c).nil?
210
+          CGI.escape(c)
211
+        else
212
+          c
213
+        end
214
+      end
215
+
207 216
       old_to_new_index << old_to_new_index.last + output.length
217
+
208 218
       output
209 219
     end.join
210 220
 

+ 2
- 2
config/initializers/twitter_regex.rb View File

@@ -1,7 +1,7 @@
1 1
 module Twitter
2 2
   class Regex
3
-    REGEXEN[:valid_general_url_path_chars] = /[^\p{White_Space}\(\)\?]/iou
4
-    REGEXEN[:valid_url_path_ending_chars] = /[^\p{White_Space}\(\)\?!\*';:=\,\.\$%\[\]~&\|@]|(?:#{REGEXEN[:valid_url_balanced_parens]})/iou
3
+    REGEXEN[:valid_general_url_path_chars] = /[^\p{White_Space}<>\(\)\?]/iou
4
+    REGEXEN[:valid_url_path_ending_chars] = /[^\p{White_Space}\(\)\?!\*"'「」<>;:=\,\.\$%\[\]~&\|@]|(?:#{REGEXEN[:valid_url_balanced_parens]})/iou
5 5
     REGEXEN[:valid_url_balanced_parens] = /
6 6
       \(
7 7
         (?:

+ 42
- 2
spec/lib/formatter_spec.rb View File

@@ -115,6 +115,22 @@ RSpec.describe Formatter do
115 115
       end
116 116
     end
117 117
 
118
+    context 'given a URL in quotation marks' do
119
+      let(:text) { '"https://example.com/"' }
120
+
121
+      it 'does not match the quotation marks' do
122
+        is_expected.to include 'href="https://example.com/"'
123
+      end
124
+    end
125
+
126
+    context 'given a URL in angle brackets' do
127
+      let(:text) { '<https://example.com/>' }
128
+
129
+      it 'does not match the angle brackets' do
130
+        is_expected.to include 'href="https://example.com/"'
131
+      end
132
+    end
133
+
118 134
     context 'given a URL with Japanese path string' do
119 135
       let(:text) { 'https://ja.wikipedia.org/wiki/日本' }
120 136
 
@@ -131,6 +147,22 @@ RSpec.describe Formatter do
131 147
       end
132 148
     end
133 149
 
150
+    context 'given a URL with a full-width space' do
151
+      let(:text) { 'https://example.com/ abc123' }
152
+
153
+      it 'does not match the full-width space' do
154
+        is_expected.to include 'href="https://example.com/"'
155
+      end
156
+    end
157
+
158
+    context 'given a URL in Japanese quotation marks' do
159
+      let(:text) { '「[https://example.org/」' }
160
+
161
+      it 'does not match the quotation marks' do
162
+        is_expected.to include 'href="https://example.org/"'
163
+      end
164
+    end
165
+
134 166
     context 'given a URL with Simplified Chinese path string' do
135 167
       let(:text) { 'https://baike.baidu.com/item/中华人民共和国' }
136 168
 
@@ -150,7 +182,11 @@ RSpec.describe Formatter do
150 182
     context 'given a URL containing unsafe code (XSS attack, visible part)' do
151 183
       let(:text) { %q{http://example.com/b<del>b</del>} }
152 184
 
153
-      it 'escapes the HTML in the URL' do
185
+      it 'does not include the HTML in the URL' do
186
+        is_expected.to include '"http://example.com/b"'
187
+      end
188
+
189
+      it 'escapes the HTML' do
154 190
         is_expected.to include '&lt;del&gt;b&lt;/del&gt;'
155 191
       end
156 192
     end
@@ -158,7 +194,11 @@ RSpec.describe Formatter do
158 194
     context 'given a URL containing unsafe code (XSS attack, invisible part)' do
159 195
       let(:text) { %q{http://example.com/blahblahblahblah/a<script>alert("Hello")</script>} }
160 196
 
161
-      it 'escapes the HTML in the URL' do
197
+      it 'does not include the HTML in the URL' do
198
+        is_expected.to include '"http://example.com/blahblahblahblah/a"'
199
+      end
200
+
201
+      it 'escapes the HTML' do
162 202
         is_expected.to include '&lt;script&gt;alert(&quot;Hello&quot;)&lt;/script&gt;'
163 203
       end
164 204
     end

Loading…
Cancel
Save