开发者

Finding if a string is an iterative substring Algorithm in C?

Problem Description

I have a string S. How can I find if the string follows S = nT ?

E开发者_Go百科xamples:

Function should return true if

1) S = "abab"  
2) S = "abcdabcd"  
3) S = "abcabcabc"  
4) S = "zzxzzxzzx"  

But if S="abcb" returns false.

I though maybe we can repeatedly call KMP on substrings of S and then decide.

i.e.:

for "abab": call on KMP on "a". it returns 2(two instances). now 2*len("a")!=len(s)

call on KMP on "ab". it returns 2. now 2*len("ab")==len(s) so return true

Can you suggest any better algorithms?


I can think of a heuristic, only call KMP on a sub string if Len(original string)/Len of(sub string) is a positive integer.

Also the maximum length of the sub string has to be less than N/2.

EDIT

Using these Heuristics Iwrote the follwing python code because my C is rusty at the moment

oldstr='ABCDABCD'    

for i in xrange(0,len(oldstr)/2):
       newslice=oldstr[0:i+1]
         if newslice*(len(oldstr)/len(newslice)) == oldstr:
             print 'pattern found', newslice
             break


You actually only need to care about testing substring lengths that are equal to the full string length divided by a prime number. The reason is: If S contains n copies of T, and n is not prime, then n = ab, and so S actually also contains a copies of bT (where "bT" means "T repeated b times"). This is an extension of anijhaw's answer.

int primes[] = { 2, 3, 5, 7, 11, 13, 17 };  /* There are one or two more... ;) */
int nPrimes = sizeof primes / sizeof primes[0];

/* Passing in the string length instead of assuming ASCIIZ strings means we
 * don't have to modify the string in-place or allocate memory for new copies
 * to handle recursion. */
int is_iterative(char *s, int len) {
    int i, j;
    for (i = 0; i < nPrimes && primes[i] < len; ++i) {
        if (len % primes[i] == 0) {
            int sublen = len / primes[i];
            /* Is it possible that s consists of repeats of length sublen? */
            for (j = sublen; j < len; j += sublen) {
                if (memcmp(s, s + j, sublen)) {
                    break;
                }
            }

            if (j == len) {
                /* All length-sublen substrings are equal.  We could stop here
                 * (meaning e.g. "abababab" will report a correct, but
                 * non-minimal repeated substring of length 4), but let's
                 * recurse to see if an even shorter repeated substring
                 * can be found. */
                return is_iterative(s, sublen);
            }
        }
    }

    return len;     /* Could not be broken into shorter, repeated substrings */
}

Notice that when recursing to find even shorter repeated substrings, we don't need to check the entire string again, just the first larger repeat -- since we've already established that the remaining large repeats are, well, repeats of the first one. :)


I don't see that the KMP algorithm helps in this case. It is not a matter of determining where to begin the next match. It seems that one way to reduce the search time is to start with the longest possibility (half the length) and work downward. The only lengths that neeed to be tested are lengths that evenly divide into the total length. Here is an example in Ruby. I should add that I realize the question was tagged as C, but this was just a simple way to show the algorithm I was thinking about (and allowed me to test that it worked).

class String
def IsPattern( )
    len = self.length
    testlen = len / 2
    # the fastest is to start with two entries and work down
    while ( testlen > 0 )
        # if this is not an even divisor then it can't fit the pattern
        if ( len % testlen == 0 )
            # evenly divides, so it may match
            if ( self == self[0..testlen-1] * ( len / testlen ))
                return true
            end

        end
        testlen = testlen - 1
    end
    # must not have matched
    false
end
end

if __FILE__ == $0

   ARGV.each do |str|
       puts "%s, %s" % [str, str.IsPattern ? "true" : "false" ]
   end

end



[C:\test]ruby patterntest.rb a aa abab abcdabcd abcabcabc zzxzzxzzx abcd
a, false
aa, true
abab, true
abcdabcd, true
abcabcabc, true
zzxzzxzzx, true
abcd, false


I suppose you could try the following algorithm:

Lets L to be a possible substring length which generates the original word. For L from 1 to strlen(s)/2 check if the first character acquires in all L*i positions for i from 1 to strlen(s)/L. If it does then it could be a possible solution and you should check it with memcmp, if not try the next L. Of course you can skip some L values which are not dividing strlen(s).


Try this:

    char s[] = "abcabcabcabc";
int nStringLength = strlen (s);
int nMaxCheckLength = nStringLength / 2;
int nThisOffset;
int nNumberOfSubStrings;
char cMustMatch;
char cCompare;
BOOL bThisSubStringLengthRepeats;
// Check all sub string lengths up to half the total length
for (int nSubStringLength = 1;  nSubStringLength <= nMaxCheckLength;  nSubStringLength++)
{
    // How many substrings will there be?
    nNumberOfSubStrings = nStringLength / nSubStringLength;

    // Only check substrings that fit exactly
    if (nSubStringLength * nNumberOfSubStrings == nStringLength)
    {
        // Assume it's going to be ok
        bThisSubStringLengthRepeats = TRUE;

        // check each character in substring
        for (nThisOffset = 0;  nThisOffset < nSubStringLength;  nThisOffset++)
        {
            // What must it be?
            cMustMatch = s [nThisOffset];

            // check each substring's char in that position
            for (int nSubString = 1;  nSubString < nNumberOfSubStrings;  nSubString++)
            {
                cCompare = s [(nSubString * nSubStringLength) + nThisOffset];
                // Don't bother checking more if this doesn't match
                if (cCompare != cMustMatch)
                {
                    bThisSubStringLengthRepeats = FALSE;
                    break;
                }
            }

            // Stop checking this substring
            if (!bThisSubStringLengthRepeats)
            {
                break;
            }
        }

        // We have found a match!
        if (bThisSubStringLengthRepeats)
        {
            return TRUE;
        }
    }
}

// We went through the whole lot, but no matches found
return FALSE;


This is Java code but you should get the idea:

        String str = "ababcababc";
    int repPos = 0;
    int repLen = 0;
    for( int i = 0; i < str.length(); i++ ) {
        if( repLen == 0 ) {
            repLen = 1;
        } else {
            char c = str.charAt( i );
            if( c == str.charAt( repPos ) ) {
                repPos = ++repPos % repLen;
            } else {
                repLen = i+1;
            }
        }
    }

This will return the length of the shortest repeating chunk or the length of the string if there's no repetition.


You can build the suffix array of the string, sort it.
Now look for series of ever doubling suffixes and when you've reached one that's the size of the entire string (S) the first in the series will give you T.

For example:

abcd <-- T
abcdabcd <-- S
bcd
bcdabcd
cd
cdabcd
d
dabcd

x
xzzx
xzzxzzx
zx
zxzzx
zxzzxzzx
zzx <-- T
zzxzzx
zzxzzxzzx <-- S

a
apa
apapa
apapapa
pa <-- T
papa
papapa <-- Another T, not detected by this algo
papapapa <-- S


The brute force approach would be to pick all possible substrings and see if they can form the entire string.

We can do one better using the observation that for a substring to be a valid candidate len(str) % len(substr) == 0. This can be deduced from the problem statement.

Here is the full code:

bool isRational(const string &str){
    int len = str.length();
    const auto &factors = getFactors(len); // this would include 1 but exclude len
    // sort(factors.begin(), factors.end()); To get out of the loop faster. Why? See https://stackoverflow.com/a/4698155/1043773
    for(auto iter = factors.rbegin(); iter != factors.rend(); ++iter){
        auto factor = *iter;
        bool result = true;
        for(int i = 0; i < factor && result; ++i){
            for(int j = i + factor; j < len; j += factor, ++cntr){
                if (str[i] != str[j]) { result = false; break; }
            }
        }

        if (result) { return true;}
    }
    return false;
}

Notice that there is a faster variation in terms of time complexity which uses KMP.

The above algorithm is O(N * factorCount(N)) But the good thing about this algorithm is it can bail out much faster than the KMP algorithm. Also the number of factors do not grow much.

Here is the graph of [i, factorCount(i)] for i <= 10^6

Finding if a string is an iterative substring Algorithm in C?

Here is how the algorithm performs as against the KMP algorithm. Red graph is O(N * factorCount(N)) and Blue is O(N) KMP

The KMP code is picked from here

Finding if a string is an iterative substring Algorithm in C?


Here is a simple and effective solution in C as requested:

  • it tests all factors of the length of the string in sqrt(len) iterations.
  • it uses a single call to memcmp to assert the predicate for each potential pattern length.
  • it returns the minimum pattern length.
#include <string.h>

// check if string s is a repeated pattern.
// return the pattern length or 0.
size_t is_iterative(const char *s) {
    size_t i, j, res = 0, len = strlen(s);
    if (len > 1 && !memcmp(s, s + 1, len - 1))
        return 1;
    for (i = 2; i <= (j = len / i); i++) {
        if (len % i == 0) {
            if (!memcmp(s, s + i, len - i))
                return i;
            if (!memcmp(s, s + j, len - j)) {
                res = len = j;
                i--;  // try the same divisor again
            }
        }
    }
    return res;
}

An much faster approach would implement these steps:

  • initialize an array a of UCHAR_MAX+1 integers to 0.
  • scan the string, counting all byte values into a.
  • compute the gcd of all non-zero byte counts and the string length.
  • if the gcd is 1, the string is not iterative.
  • otherwise the repeat count must be a divisor of the gcd.
  • only test the corresponding pattern lengths with the above method.

Here is the code:

#include <limits.h>
#include <string.h>

size_t gcd_size(size_t a, size_t b) {
    while (b != 0) {
        size_t t = b;
        b = a % b;
        a = t;
    }
    return a;
}

// check if string s is a repeated pattern.
// return the pattern length or 0.
size_t is_iterative(const char *s) {
    size_t a[UCHAR_MAX+1] = { 0 };
    size_t i, j, plen, rep, len, res = 0;
    for (i = 0; s[i]; i++) {
        a[(unsigned char)s[i]]++;
    }
    rep = len = i;
    if (rep <= 1)
        return 0;
    for (i = 0; i <= UCHAR_MAX; i++) {
        if (a[i]) {
            rep = gcd_size(a[i], rep);
            if (rep == 1)
                return 0;
        }
    }
    plen = len / rep;
    if (!memcmp(s, s + plen, len - plen))
        return plen;
    for (i = 2; i <= (j = rep / i); i++) {
        if (rep % i == 0) {
            plen = len / j;
            if (!memcmp(s, s + plen, len - plen))
                return plen;
            plen = len / i;
            if (!memcmp(s, s + plen, len - plen)) {
                res = len = plen;
                rep = j;
                i--;  // try the same divisor again
            }
        }
    }
    return res;
}


#include <bits/stdc++.h>
using namespace std;
int main()
{
    bool check(string, string);
    string str="abcabcabc";
    string str2="abcabcabcabcabc";
    if(str2.size()<str.size()) swap(str,str2);

    for(int i=1;i<=str.size()/2;i++)
    {
        if(check(str.substr(0,i), str))
        {
            if(check(str.substr(0,i), str2))
            {
                cout<<str.substr(0,i);
                return 1;
            }
        }
    }
    cout<<0;
}

bool check(string substring, string str)
{
    int len=substring.size();
    for(int i=0;i<str.size();)
    {
        if(str.substr(i,substring.size())==substring)
        {
            i=i+substring.size();
        }
        else
            return false;
    }
    return true;
}
0

上一篇:

下一篇:

精彩评论

暂无评论...
验证码 换一张
取 消

最新问答

问答排行榜