Problem

You are given an array of strings words and a string target.

A string x is called valid if x is a prefix of any string in words.

A prefix of a string is a substring that starts from the beginning of the string and extends to any point within it.

Return the minimum number of valid strings that can be concatenated to form target. If it is not possible to form target, return -1.

https://leetcode.cn/problems/minimum-number-of-valid-strings-to-form-target-i/

Example 1:

Input: words = ["abc","aaaaa","bcdef"], target = "aabcdabc"
Output: 3
Explanation:
The target string can be formed by concatenating:

  • Prefix of length 2 of words[1], i.e. "aa".
  • Prefix of length 3 of words[2], i.e. "bcd".
  • Prefix of length 3 of words[0], i.e. "abc".

Example 2:

Input: words = ["abababab","ab"], target = "ababaababa"
Output: 2
Explanation:
The target string can be formed by concatenating:

  • Prefix of length 5 of words[0], i.e. "ababa".
  • Prefix of length 5 of words[0], i.e. "ababa".

Example 3:

Input: words = ["abcdef"], target = "xyz"
Output: -1

Constraints:

  • 1 <= words.length <= 100
  • 1 <= words[i].length <= 5 * 10³
  • The input is generated such that sum(words[i].length) <= 10⁵.
  • words[i] consists only of lowercase English letters.
  • 1 <= target.length <= 5 * 10³
  • target consists only of lowercase English letters.

Test Cases

1
2
class Solution:
def minValidStrings(self, words: List[str], target: str) -> int:
solution_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
import pytest

from solution import Solution
from solution_ac import Solution as SolutionAc


@pytest.mark.parametrize('words, target, expected', [
(["abc","aaaaa","bcdef"], "aabcdabc", 3),
(["abababab","ab"], "ababaababa", 2),
(["abcdef"], "xyz", -1),

(["caacbbbbbcaccbcbcccbcbbbcbcabbcaaacabbcbbccabcac","baccccb","aacabcca"], "aacaacbabb", 5),
(["adaeabcabdcaabbeceeadeaebcdddeadcbceeeadddabdc","a"], "beeea", -1),
])
class Test:
def test_solution(self, words, target, expected):
sol = Solution()
assert sol.minValidStrings(words, target) == expected

def test_solution_ac(self, words, target, expected):
sol = SolutionAc()
assert sol.minValidStrings(words, target) == expected

Thoughts

直观的想法是看以位置 i 开头的子字符串 target[i:],从位置 i 开始最长能够在 words 中匹配到的长度(记为 plenᵢ),可以取任意 1 ≤ p ≤ plenᵢtarget[i:] 分为两段,即 target[i:i+p]target[i+p:],前半段是一个 words 的前缀,递归处理后半段子问题即可。

dp(i) 表示可以拼接出 target[i:] 的最小 valid 字符串个数,那么有:

dp(i)=1+min1pplenidp(i+p)dp(i)=1+\min_{1\le p\le plen_i}dp(i+p)

考虑到边界和递推,如果 dp(i) 是 0,直接记为 -1。另外记初值 dp(n) = 0

最后的结果取 dp(0)

words 构建前缀树(参见 208. Implement Trie (Prefix Tree)139. Word Break),定义一个 match 方法在前缀树中找到能匹配指定字符串的最长前缀,其长度即为 plenᵢ

时间复杂度 O(k m + n²),其中 k 是 words 中单词的平均长度,m 是 words 的长度,n 是 target 的长度。空间复杂度 O(k m + n)

比较慢。

Code

solution.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
class Solution:
def minValidStrings(self, words: list[str], target: str) -> int:
trie: dict[str, dict] = {}
n = len(target)

def trie_add(word: str) -> None:
node = trie
for c in word:
if c not in node: node[c] = {}
node = node[c]

def trie_match(left: int) -> int:
"""Returns the maximum length k, where trie.startsWith(target[left:left+k])"""
node = trie
curr = left
while curr < n and target[curr] in node:
node = node[target[curr]]
curr += 1
return curr - left

for word in words:
trie_add(word)

dp = [-1] * (n + 1)
dp[-1] = 0
for left in range(n - 1, -1, -1):
plen = trie_match(left)
for right in range(left + 1, left + plen + 1):
if dp[right] < 0: continue
if not 0 < dp[left] < 1 + dp[right]:
dp[left] = 1 + dp[right]

return dp[0]

Faster - AC 自动机

3213. Construct String with Minimum Cost 中实现了 AC 自动机,利用 AC 自动机,可以用平均 O(km + n) 时间找出 target 中所有能匹配到的单词。

但这里只是需要有前缀,可以不是完全匹配,需要稍微修改一下 problem 3213 中 AC 自动机的 match 方法。match 方法在匹配路径的终点返回构造时记录的所有单词,从而得到所有匹配到的单词。实际上即使没到终点,路径上的任何一个位置,都是一个前缀。所以把路径上的位置都输出即可。新的方法命名为 prefix。

仿照 problem 3213 中正向的动态规划 dp',重新定义本题的 dp:dp(i) 表示可以拼接出子字符串 target[:i] 的最小前缀个数,定义 dp(0) = 0,其他值均初始化为 ∞,最终所求结果为 dp(n)

令 i 从 0 递增到 n - 1。对于当前的 i,如果 dp(i) = ∞,说明 target[:i] 无法由前缀拼接而成,跳过。否则看从位置 i 开始可以匹配到哪些前缀,比如 target[i:j] 是一个前缀,那么 dp(j) = min{dp(j), dp(i) + 1}

注意到对于 0 ≤ i < k < j < n,如果 target[i:k]target[i:j] 都是前缀,那么即使 target[k:j] 是前缀也不需要考虑,因为把 target[k:j] 与左边的 target[i:k] 拼成一个新的前缀,所用的前缀数量更少。而上边基于 AC 自动机 match 方法改造出来的前缀搜索方法 prefix,刚好不会输出 target[k:j](这么巧吗?)。

时间复杂度 O(km + n),空间复杂度 O(km + n)

solution_ac.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
from collections import deque
from typing import Generator


class TrieNode:
def __init__(self):
self.children: dict[str, 'TrieNode'] = {}
self.level = 0
self.fail: 'TrieNode' = None


class TrieTree:
def __init__(self):
self._root = TrieNode()

def add(self, word: str) -> None:
node = self._root
for c in word:
if c not in node.children:
node.children[c] = TrieNode()
node.children[c].level = node.level + 1
node = node.children[c]

def build_ac(self) -> None:
"""Builds failure links for ac-machine using BFS."""
queue = deque()
for node in self._root.children.values():
queue.append(node)
node.fail = self._root

# Breadth-first traversal of the trie.
while len(queue):
node = queue.popleft()
for c, child in node.children.items():
queue.append(child)
fail = node.fail

# Find the longest proper suffix that is also a prefix.
while fail and c not in fail.children:
fail = fail.fail

# Set failure link.
child.fail = fail.children[c] if fail else self._root

def prefix(self, target: str) -> Generator[tuple[int, int], None, None]:
"""Finds all matched prefix in target, yields (start, end), where target[start:end] is a prefix.
For 0 <= i < k < j < n, if target[i:k], target[k:j], and target[i:j] are all prefixes, (k, j) will be ignore.
"""
node = self._root
for i, c in enumerate(target):
# Follow failure links until a match is found.
while node and c not in node.children:
node = node.fail

if not node:
node = self._root
continue
else:
yield i - node.level, i + 1

# Move to the next node based on current character.
node = node.children[c]


class Solution:
def minValidStrings(self, words: list[str], target: str) -> int:
trie = TrieTree()
for word in words: trie.add(word)
trie.build_ac()

n = len(target)
dp = [-1] * (n + 1)
dp[0] = 0
for i, j in trie.prefix(target):
if dp[i] < 0: continue
if dp[j] < 0 or dp[j] > dp[i] + 1:
dp[j] = dp[i] + 1

return dp[n]

附:针对 AC 自动机的构建和多模式前缀搜索的 test cases:

solution_ac_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import pytest

from solution_ac import TrieTree


@pytest.mark.parametrize('target, words, expected', [
("hello worldhello", ["hello", "world"], [(0,0+5), (6,6+5), (11,11+5)]),
("abxabcabcaby", ["ab", "abc", "aby"], [(0,0+2), (3,3+3), (6,6+3), (9,9+3)]),
("abcdef", ["abdef","abc","d","def","ef"], [(0,0+3), (3,3+3)]),
("aaaa", ["z","zz","zzz"], []),
("r", ["r"], [(0,1)]),

("aabcdabc", ["abc","aaaaa","bcdef"], [(0,2), (1,1+2,1+3), (2,2+3,2+3), (5,5+3)]),
("ababaababa", ["abababab","ab"], [(0,5), (5,5+5)]),
("xyz", ["abcdef"], []),
("aacaacbabb", ["caacbbbbbcaccbcbcccbcbbbcbcabbcaaacabbcbbccabcac","baccccb","aacabcca"], [(0,4), (2,2+3,2+5), (6,6+2,6+2), (8,8+1), (9,9+1)]),
("beeea", ["adaeabcabdcaabbeceeadeaebcdddeadcbceeeadddabdc","a"], [(4,4+1)]),
])
class Test:
def test_ac(self, target, words, expected):
trie = TrieTree()
for word in words: trie.add(word)
trie.build_ac()
matches = list(trie.prefix(target))
expected = self._expand_pieces(expected)
assert matches == expected

def _expand_pieces(self, pieces):
result = []
# for i, j in pieces:
# result.extend((i, k+1) for k in range(i, j))
for piece in pieces:
if len(piece) == 3:
i, j1, j = piece
else:
i, j = piece
j1 = i + 1
result.extend((i, k) for k in range(j1, j + 1))

return result