replace.go

1// Copyright 2011 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package strings
6
7import (
8	"io"
9	"sync"
10)
11
12// Replacer replaces a list of strings with replacements.
13// It is safe for concurrent use by multiple goroutines.
14type Replacer struct {
15	once   sync.Once // guards buildOnce method
16	r      replacer
17	oldnew []string
18}
19
20// replacer is the interface that a replacement algorithm needs to implement.
21type replacer interface {
22	Replace(s string) string
23	WriteString(w io.Writer, s string) (n int, err error)
24}
25
26// NewReplacer returns a new Replacer from a list of old, new string
27// pairs. Replacements are performed in the order they appear in the
28// target string, without overlapping matches. The old string
29// comparisons are done in argument order.
30//
31// NewReplacer panics if given an odd number of arguments.
32func NewReplacer(oldnew ...string) *Replacer {
33	if len(oldnew)%2 == 1 {
34		panic("strings.NewReplacer: odd argument count")
35	}
36	return &Replacer{oldnew: append([]string(nil), oldnew...)}
37}
38
39func (r *Replacer) buildOnce() {
40	r.r = r.build()
41	r.oldnew = nil
42}
43
44func (b *Replacer) build() replacer {
45	oldnew := b.oldnew
46	if len(oldnew) == 2 && len(oldnew[0]) > 1 {
47		return makeSingleStringReplacer(oldnew[0], oldnew[1])
48	}
49
50	allNewBytes := true
51	for i := 0; i < len(oldnew); i += 2 {
52		if len(oldnew[i]) != 1 {
53			return makeGenericReplacer(oldnew)
54		}
55		if len(oldnew[i+1]) != 1 {
56			allNewBytes = false
57		}
58	}
59
60	if allNewBytes {
61		r := byteReplacer{}
62		for i := range r {
63			r[i] = byte(i)
64		}
65		// The first occurrence of old->new map takes precedence
66		// over the others with the same old string.
67		for i := len(oldnew) - 2; i >= 0; i -= 2 {
68			o := oldnew[i][0]
69			n := oldnew[i+1][0]
70			r[o] = n
71		}
72		return &r
73	}
74
75	r := byteStringReplacer{toReplace: make([]string, 0, len(oldnew)/2)}
76	// The first occurrence of old->new map takes precedence
77	// over the others with the same old string.
78	for i := len(oldnew) - 2; i >= 0; i -= 2 {
79		o := oldnew[i][0]
80		n := oldnew[i+1]
81		// To avoid counting repetitions multiple times.
82		if r.replacements[o] == nil {
83			// We need to use string([]byte{o}) instead of string(o),
84			// to avoid utf8 encoding of o.
85			// E. g. byte(150) produces string of length 2.
86			r.toReplace = append(r.toReplace, string([]byte{o}))
87		}
88		r.replacements[o] = []byte(n)
89
90	}
91	return &r
92}
93
94// Replace returns a copy of s with all replacements performed.
95func (r *Replacer) Replace(s string) string {
96	r.once.Do(r.buildOnce)
97	return r.r.Replace(s)
98}
99
100// WriteString writes s to w with all replacements performed.
101func (r *Replacer) WriteString(w io.Writer, s string) (n int, err error) {
102	r.once.Do(r.buildOnce)
103	return r.r.WriteString(w, s)
104}
105
106// trieNode is a node in a lookup trie for prioritized key/value pairs. Keys
107// and values may be empty. For example, the trie containing keys "ax", "ay",
108// "bcbc", "x" and "xy" could have eight nodes:
109//
110//	n0  -
111//	n1  a-
112//	n2  .x+
113//	n3  .y+
114//	n4  b-
115//	n5  .cbc+
116//	n6  x+
117//	n7  .y+
118//
119// n0 is the root node, and its children are n1, n4 and n6; n1's children are
120// n2 and n3; n4's child is n5; n6's child is n7. Nodes n0, n1 and n4 (marked
121// with a trailing "-") are partial keys, and nodes n2, n3, n5, n6 and n7
122// (marked with a trailing "+") are complete keys.
123type trieNode struct {
124	// value is the value of the trie node's key/value pair. It is empty if
125	// this node is not a complete key.
126	value string
127	// priority is the priority (higher is more important) of the trie node's
128	// key/value pair; keys are not necessarily matched shortest- or longest-
129	// first. Priority is positive if this node is a complete key, and zero
130	// otherwise. In the example above, positive/zero priorities are marked
131	// with a trailing "+" or "-".
132	priority int
133
134	// A trie node may have zero, one or more child nodes:
135	//  * if the remaining fields are zero, there are no children.
136	//  * if prefix and next are non-zero, there is one child in next.
137	//  * if table is non-zero, it defines all the children.
138	//
139	// Prefixes are preferred over tables when there is one child, but the
140	// root node always uses a table for lookup efficiency.
141
142	// prefix is the difference in keys between this trie node and the next.
143	// In the example above, node n4 has prefix "cbc" and n4's next node is n5.
144	// Node n5 has no children and so has zero prefix, next and table fields.
145	prefix string
146	next   *trieNode
147
148	// table is a lookup table indexed by the next byte in the key, after
149	// remapping that byte through genericReplacer.mapping to create a dense
150	// index. In the example above, the keys only use 'a', 'b', 'c', 'x' and
151	// 'y', which remap to 0, 1, 2, 3 and 4. All other bytes remap to 5, and
152	// genericReplacer.tableSize will be 5. Node n0's table will be
153	// []*trieNode{ 0:n1, 1:n4, 3:n6 }, where the 0, 1 and 3 are the remapped
154	// 'a', 'b' and 'x'.
155	table []*trieNode
156}
157
158func (t *trieNode) add(key, val string, priority int, r *genericReplacer) {
159	if key == "" {
160		if t.priority == 0 {
161			t.value = val
162			t.priority = priority
163		}
164		return
165	}
166
167	if t.prefix != "" {
168		// Need to split the prefix among multiple nodes.
169		var n int // length of the longest common prefix
170		for ; n < len(t.prefix) && n < len(key); n++ {
171			if t.prefix[n] != key[n] {
172				break
173			}
174		}
175		if n == len(t.prefix) {
176			t.next.add(key[n:], val, priority, r)
177		} else if n == 0 {
178			// First byte differs, start a new lookup table here. Looking up
179			// what is currently t.prefix[0] will lead to prefixNode, and
180			// looking up key[0] will lead to keyNode.
181			var prefixNode *trieNode
182			if len(t.prefix) == 1 {
183				prefixNode = t.next
184			} else {
185				prefixNode = &trieNode{
186					prefix: t.prefix[1:],
187					next:   t.next,
188				}
189			}
190			keyNode := new(trieNode)
191			t.table = make([]*trieNode, r.tableSize)
192			t.table[r.mapping[t.prefix[0]]] = prefixNode
193			t.table[r.mapping[key[0]]] = keyNode
194			t.prefix = ""
195			t.next = nil
196			keyNode.add(key[1:], val, priority, r)
197		} else {
198			// Insert new node after the common section of the prefix.
199			next := &trieNode{
200				prefix: t.prefix[n:],
201				next:   t.next,
202			}
203			t.prefix = t.prefix[:n]
204			t.next = next
205			next.add(key[n:], val, priority, r)
206		}
207	} else if t.table != nil {
208		// Insert into existing table.
209		m := r.mapping[key[0]]
210		if t.table[m] == nil {
211			t.table[m] = new(trieNode)
212		}
213		t.table[m].add(key[1:], val, priority, r)
214	} else {
215		t.prefix = key
216		t.next = new(trieNode)
217		t.next.add("", val, priority, r)
218	}
219}
220
221func (r *genericReplacer) lookup(s string, ignoreRoot bool) (val string, keylen int, found bool) {
222	// Iterate down the trie to the end, and grab the value and keylen with
223	// the highest priority.
224	bestPriority := 0
225	node := &r.root
226	n := 0
227	for node != nil {
228		if node.priority > bestPriority && !(ignoreRoot && node == &r.root) {
229			bestPriority = node.priority
230			val = node.value
231			keylen = n
232			found = true
233		}
234
235		if s == "" {
236			break
237		}
238		if node.table != nil {
239			index := r.mapping[s[0]]
240			if int(index) == r.tableSize {
241				break
242			}
243			node = node.table[index]
244			s = s[1:]
245			n++
246		} else if node.prefix != "" && HasPrefix(s, node.prefix) {
247			n += len(node.prefix)
248			s = s[len(node.prefix):]
249			node = node.next
250		} else {
251			break
252		}
253	}
254	return
255}
256
257// genericReplacer is the fully generic algorithm.
258// It's used as a fallback when nothing faster can be used.
259type genericReplacer struct {
260	root trieNode
261	// tableSize is the size of a trie node's lookup table. It is the number
262	// of unique key bytes.
263	tableSize int
264	// mapping maps from key bytes to a dense index for trieNode.table.
265	mapping [256]byte
266}
267
268func makeGenericReplacer(oldnew []string) *genericReplacer {
269	r := new(genericReplacer)
270	// Find each byte used, then assign them each an index.
271	for i := 0; i < len(oldnew); i += 2 {
272		key := oldnew[i]
273		for j := 0; j < len(key); j++ {
274			r.mapping[key[j]] = 1
275		}
276	}
277
278	for _, b := range r.mapping {
279		r.tableSize += int(b)
280	}
281
282	var index byte
283	for i, b := range r.mapping {
284		if b == 0 {
285			r.mapping[i] = byte(r.tableSize)
286		} else {
287			r.mapping[i] = index
288			index++
289		}
290	}
291	// Ensure root node uses a lookup table (for performance).
292	r.root.table = make([]*trieNode, r.tableSize)
293
294	for i := 0; i < len(oldnew); i += 2 {
295		r.root.add(oldnew[i], oldnew[i+1], len(oldnew)-i, r)
296	}
297	return r
298}
299
300type appendSliceWriter []byte
301
302// Write writes to the buffer to satisfy io.Writer.
303func (w *appendSliceWriter) Write(p []byte) (int, error) {
304	*w = append(*w, p...)
305	return len(p), nil
306}
307
308// WriteString writes to the buffer without string->[]byte->string allocations.
309func (w *appendSliceWriter) WriteString(s string) (int, error) {
310	*w = append(*w, s...)
311	return len(s), nil
312}
313
314type stringWriter struct {
315	w io.Writer
316}
317
318func (w stringWriter) WriteString(s string) (int, error) {
319	return w.w.Write([]byte(s))
320}
321
322func getStringWriter(w io.Writer) io.StringWriter {
323	sw, ok := w.(io.StringWriter)
324	if !ok {
325		sw = stringWriter{w}
326	}
327	return sw
328}
329
330func (r *genericReplacer) Replace(s string) string {
331	buf := make(appendSliceWriter, 0, len(s))
332	r.WriteString(&buf, s)
333	return string(buf)
334}
335
336func (r *genericReplacer) WriteString(w io.Writer, s string) (n int, err error) {
337	sw := getStringWriter(w)
338	var last, wn int
339	var prevMatchEmpty bool
340	for i := 0; i <= len(s); {
341		// Fast path: s[i] is not a prefix of any pattern.
342		if i != len(s) && r.root.priority == 0 {
343			index := int(r.mapping[s[i]])
344			if index == r.tableSize || r.root.table[index] == nil {
345				i++
346				continue
347			}
348		}
349
350		// Ignore the empty match iff the previous loop found the empty match.
351		val, keylen, match := r.lookup(s[i:], prevMatchEmpty)
352		prevMatchEmpty = match && keylen == 0
353		if match {
354			wn, err = sw.WriteString(s[last:i])
355			n += wn
356			if err != nil {
357				return
358			}
359			wn, err = sw.WriteString(val)
360			n += wn
361			if err != nil {
362				return
363			}
364			i += keylen
365			last = i
366			continue
367		}
368		i++
369	}
370	if last != len(s) {
371		wn, err = sw.WriteString(s[last:])
372		n += wn
373	}
374	return
375}
376
377// singleStringReplacer is the implementation that's used when there is only
378// one string to replace (and that string has more than one byte).
379type singleStringReplacer struct {
380	finder *stringFinder
381	// value is the new string that replaces that pattern when it's found.
382	value string
383}
384
385func makeSingleStringReplacer(pattern string, value string) *singleStringReplacer {
386	return &singleStringReplacer{finder: makeStringFinder(pattern), value: value}
387}
388
389func (r *singleStringReplacer) Replace(s string) string {
390	var buf Builder
391	i, matched := 0, false
392	for {
393		match := r.finder.next(s[i:])
394		if match == -1 {
395			break
396		}
397		matched = true
398		buf.Grow(match + len(r.value))
399		buf.WriteString(s[i : i+match])
400		buf.WriteString(r.value)
401		i += match + len(r.finder.pattern)
402	}
403	if !matched {
404		return s
405	}
406	buf.WriteString(s[i:])
407	return buf.String()
408}
409
410func (r *singleStringReplacer) WriteString(w io.Writer, s string) (n int, err error) {
411	sw := getStringWriter(w)
412	var i, wn int
413	for {
414		match := r.finder.next(s[i:])
415		if match == -1 {
416			break
417		}
418		wn, err = sw.WriteString(s[i : i+match])
419		n += wn
420		if err != nil {
421			return
422		}
423		wn, err = sw.WriteString(r.value)
424		n += wn
425		if err != nil {
426			return
427		}
428		i += match + len(r.finder.pattern)
429	}
430	wn, err = sw.WriteString(s[i:])
431	n += wn
432	return
433}
434
435// byteReplacer is the implementation that's used when all the "old"
436// and "new" values are single ASCII bytes.
437// The array contains replacement bytes indexed by old byte.
438type byteReplacer [256]byte
439
440func (r *byteReplacer) Replace(s string) string {
441	var buf []byte // lazily allocated
442	for i := 0; i < len(s); i++ {
443		b := s[i]
444		if r[b] != b {
445			if buf == nil {
446				buf = []byte(s)
447			}
448			buf[i] = r[b]
449		}
450	}
451	if buf == nil {
452		return s
453	}
454	return string(buf)
455}
456
457func (r *byteReplacer) WriteString(w io.Writer, s string) (n int, err error) {
458	// TODO(bradfitz): use io.WriteString with slices of s, avoiding allocation.
459	bufsize := 32 << 10
460	if len(s) < bufsize {
461		bufsize = len(s)
462	}
463	buf := make([]byte, bufsize)
464
465	for len(s) > 0 {
466		ncopy := copy(buf, s)
467		s = s[ncopy:]
468		for i, b := range buf[:ncopy] {
469			buf[i] = r[b]
470		}
471		wn, err := w.Write(buf[:ncopy])
472		n += wn
473		if err != nil {
474			return n, err
475		}
476	}
477	return n, nil
478}
479
480// byteStringReplacer is the implementation that's used when all the
481// "old" values are single ASCII bytes but the "new" values vary in size.
482type byteStringReplacer struct {
483	// replacements contains replacement byte slices indexed by old byte.
484	// A nil []byte means that the old byte should not be replaced.
485	replacements [256][]byte
486	// toReplace keeps a list of bytes to replace. Depending on length of toReplace
487	// and length of target string it may be faster to use Count, or a plain loop.
488	// We store single byte as a string, because Count takes a string.
489	toReplace []string
490}
491
492// countCutOff controls the ratio of a string length to a number of replacements
493// at which (*byteStringReplacer).Replace switches algorithms.
494// For strings with higher ration of length to replacements than that value,
495// we call Count, for each replacement from toReplace.
496// For strings, with a lower ratio we use simple loop, because of Count overhead.
497// countCutOff is an empirically determined overhead multiplier.
498// TODO(tocarip) revisit once we have register-based abi/mid-stack inlining.
499const countCutOff = 8
500
501func (r *byteStringReplacer) Replace(s string) string {
502	newSize := len(s)
503	anyChanges := false
504	// Is it faster to use Count?
505	if len(r.toReplace)*countCutOff <= len(s) {
506		for _, x := range r.toReplace {
507			if c := Count(s, x); c != 0 {
508				// The -1 is because we are replacing 1 byte with len(replacements[b]) bytes.
509				newSize += c * (len(r.replacements[x[0]]) - 1)
510				anyChanges = true
511			}
512
513		}
514	} else {
515		for i := 0; i < len(s); i++ {
516			b := s[i]
517			if r.replacements[b] != nil {
518				// See above for explanation of -1
519				newSize += len(r.replacements[b]) - 1
520				anyChanges = true
521			}
522		}
523	}
524	if !anyChanges {
525		return s
526	}
527	buf := make([]byte, newSize)
528	j := 0
529	for i := 0; i < len(s); i++ {
530		b := s[i]
531		if r.replacements[b] != nil {
532			j += copy(buf[j:], r.replacements[b])
533		} else {
534			buf[j] = b
535			j++
536		}
537	}
538	return string(buf)
539}
540
541func (r *byteStringReplacer) WriteString(w io.Writer, s string) (n int, err error) {
542	sw := getStringWriter(w)
543	last := 0
544	for i := 0; i < len(s); i++ {
545		b := s[i]
546		if r.replacements[b] == nil {
547			continue
548		}
549		if last != i {
550			nw, err := sw.WriteString(s[last:i])
551			n += nw
552			if err != nil {
553				return n, err
554			}
555		}
556		last = i + 1
557		nw, err := w.Write(r.replacements[b])
558		n += nw
559		if err != nil {
560			return n, err
561		}
562	}
563	if last != len(s) {
564		var nw int
565		nw, err = sw.WriteString(s[last:])
566		n += nw
567	}
568	return
569}
replace.go#

replace.go