""" Bios Analyzer Author: Lazar Kovacevic (2006) http://www.inverudio.com This application was written in Python using wx library for GUI interface and numpy library for fast numerical calculations. Python and these libraries are open source. For their licenses, see license files and read more on the internet. From the BiosAnalyzer file that I wrote (this application that uses above mentioned libraries), following source code for main methods is available: """ from math import * from numpy import * from random import * # SLOW but easier to read and understand code is commented ## Each method is followed by a brief explanation of the output. #----------------------------------------------------------------------------- # Calculating contiguity of the series #----------------------------------------------------------------------------- def contiguity(rawSeries): s = array(rawSeries).copy() smax = s.max(); smin = s.min(); srange = smax - smin difMean = abs(s[1:]-s[:-1]).mean() return difMean / srange ## EXAMPLE: ## rawSeries = [2,4,6,3,9,1] ## difMean = Average(2,2,3,6,8) = 4.2 ## contiguity = 4.2 / (9-1) = 0.525 ## Contiguous (continuous in a descreet way) series have contiguity close to zero. ## This means that each point in the time series is in a close proximity (relative to ## a range of the series) of the previous point. #----------------------------------------------------------------------------- # Embedding the series #----------------------------------------------------------------------------- """ Input: rawSeries is a list of numbers embedding is a vector length embeddSeries() calculates Euclidean norms of all subseries of length = embedding """ def embeddSeries(rawSeries, embedding): t = array(rawSeries).copy(); t *= t; temp = t.copy() for i in range(embedding - 1): t[:-(i+1)] += temp[i+1:] return sqrt(t[:-(i+1)]) ## EXAMPLE: ## result = embeddSeries(rawSeries = [2,4,6,3,9,1], embedding = 3) ## result = [sqrt(2*2+4*4+6*6),sqrt(4*4+6*6+3*3),sqrt(6*6+3*3+9*9),sqrt(3*3+9*9+1*1)] ## result = [7.483,7.810,11.225,9.539] #----------------------------------------------------------------------------- # Calculating percent isometry for the (embedded) series #----------------------------------------------------------------------------- """ Input: embeddedSeries is a list of numbers comparisons is a minimum number of comparisons to calculate radius is a number that determines the maximum difference that defines similar data points percentIsometry() calculates the approximate percent of similar numbers in the series For each similar pair, it also calculates if consecutive numbers are similar When embedded series are passed as first argument, isometry is measured (instead of similarity) """ def percentIsometry(embeddedSeries, comparisons = 50000, radius = 1.): s = array(embeddedSeries).copy(); l = len(s)-1; r = radius isometries = 0.; consecutiveIsometries = 0.; cnt = 0 # SLOW # ## each pass finds 2 random numbers and compares them, if they are similar within cutoff radius r, isometry is counted ## if isometry is detected, successive numbers are also tested, and if similar, consecutive isometry is counted #for i in range(comparisons): # x1 = int(round((l-1) * rand())) # x2 = int(round((l-1) * rand())) # if abs(s[x1] - s[x2]) < r: # isometries += 1 # if abs(s[x1+1] - s[x2+1]) < r: # consecutiveIsometries += 1 #goes thru while loop while number of comparisons is less than given one while cnt < comparisons: x = int(round((l-2) * rand())+1); rec = abs(s[:-x]-s[x:]) < r; crec = rec[:-1]*rec[1:] rec.sort(); isometries += len(rec)-rec.searchsorted(True); cnt += len(rec) crec.sort(); consecutiveIsometries += len(crec)-crec.searchsorted(True) return 100.0*isometries/cnt,100.0*(isometries>0 and consecutiveIsometries/isometries or 0) ## EXAMPLE: ## embeddSeries = [7.483,7.810,11.225,9.539] ## radius = 1 ## only first two points are isometric with each other within radius 1 (7.483 and 7.810) ## there are total of 4*3 possible combinations ## method above actually compares a sample of all possible combinations, ## but in this small example it is easier to compute exact value ## portionIsometry = 2 isometric combinations / 4*3 possible combinations = 1/6 ## percentIsometry = 100 * 1/6 = 16.7% ## NOTE: ## SLOW version gives better results for small number of comparisons, because it samples ## really random pairs of vectors. Resulting plot is smooth for a few thousands of comparisons. ## Fast version on the other hand samples random sequences of consecutive vectors which introduces ## artifacts in the results that are, as experiments have shown, eliminated with the high enough ## number of comparisons. This algorithm is much faster than SLOW one, and I give here small illustration ## of when to expect considerable artifact. ## ## ILLUSTRATION: ## If the series has 5000 data points, and the number of comparisons is 1000, the SLOW method gives good results. ## Fast method DOES NOT give good results. Results are totally eratic. ## This is because the while loop executes only 1 or few passes. ## Lets say, for example, that random position selected is 2314. ## Then, numbers on positions 2314..5000 will be compared with the number on positions 1..2686. (2686=5000-2314) ## Since 2686 comparisons is greater than 1000 comparisons, condition is met, and while loop exits. ## Comparing 2 subsequences of length > 1000 with each other is nothing like comparing 1000 random pairs. ## On the other hand, if comparisons values is 10000, then results are much better. ## This is because there were a few passes thru the while loop, and some randomization of sampling is introduced. ## With the comparison value of 100000, results are even better, curve is smooth, ## and it is still faster than the SLOW algorithm, even thought there were 20 times more comparisons! ## ## CONCLUSION: number of comparisons SHOULD BE greater than the length of the series for an order of magnitude. ## Experiments have shown that for very long series (>50000), number of comparisons does not need to be greater ## than the length of series. I guess that ergodicity of the series analyzed has to do something with this. ## TO LOOK FOR: resulting curve should be smooth. #----------------------------------------------------------------------------- # Calculating percent isometry for the series on a range of radius values #----------------------------------------------------------------------------- def radialIsometry(rawSeries, radius = [.0001,.001,.01,.1,.2,.5,1.,2.,5.,10.]): s = array(rawSeries).copy(); smax = s.max(); smin = s.min(); srange = smax - smin radialIsometry = []; radialConsecutiveIsometry = [] for i in radius: # radius as % of a range of series = radius * srange / 100 a, b = percentIsometry(rawSeries, comparisons, i * srange / 100) radialIsometry.append(a) radialConsecutiveIsometry.append(b) return radialIsometry, radialConsecutiveIsometry ## This method differentiates simple causal processes from random processes. ## For a very small radius, percentIsometry will be zero or close to zero for most time series, ## excluding trivial examples like steady states or descreet periodicities. ## Random processes show gradual increase in ConsecutiveIsometry with the increase in radius. ## Causal processes show sudden initial increase, and then increase becomes gradual. ## This means that as soon as there are isometries (radius is sufficiently large), ## there will also be consecutive isometries. Since in causal processes, simmilar causes ## produce simmilar effects, this is to be expected - as soon as there are recurrent (similar, ## isometric) values, they will have recurrent (similar, isometric) consecutive values. ## Simmilar and isometric are in this case equivalent, since raw series are being analyzed ## (embedding dimension is 1, or in other words, there is no embedding). #----------------------------------------------------------------------------- # Finding positions of isometric pairs in the (embedded) series #----------------------------------------------------------------------------- """ Input: embeddedSeries is a list of numbers window is a number that determines the maximum distance between positions of any 2 data points to compare radius is a number that determines the maximum difference that defines similar data points recurrencePlotVector() calculates positions of similar numbers within given distance of separation (window). It maps these values to a vector to be plotted. Unlike common recurrence plots that have diagonal symmetry, this plot shows only one recurrence dot for each pair, and is rotated for 45 degrees clock wise. It may be more or less easy to understand than common recurrence plots, depending on the user. """ def recurrencePlotVector(embeddedSeries, window = 150, radius = 1.): s = array(embeddedSeries).copy(); r = radius recurrencePlot = []; recurrencePlotX = []; recurrencePlotY = [] # SLOW # ## compares each 2 numbers in the series that are separated from each other by no more than a parameter window ## if the difference is less than r, maps their positions to a vector for plotting #for i in range(len(s) - 1): # for j in range(window): # if i - j > -1 and abs(s[i] - s[i-j]) < r: # recurrencePlot.append([i - j/2, j/2]) for i in range(1,window+1): rec = abs(s[:-i] - s[i:]) < r ind = rec*array(range(1,len(rec)+1)) ind.sort(); st = ind.searchsorted(True); ind = ind[st:] ind2 = ind[:]+i recurrencePlotX.extend(ind.tolist()); recurrencePlotY.extend(ind2.tolist()) recurrencePlotX.extend(ind2.tolist()); recurrencePlotY.extend(ind.tolist()) recurrencePlot = array([recurrencePlotX,recurrencePlotY]) recurrencePlot = recurrencePlot.transpose() return recurrencePlot ## Example: ## embeddSeries = [ 7.483 , 7.810 , 11.225 , 9.539 ] ## indexes = [ 1 , 2 , 3 , 4 ] ## radius = 2 ## within radius 2, points isometric with each other and their indexes are: ## points indexes ## (point m, point n) (ind m, ind n) ## (7.483, 7.810) (1,2) ## (7.810, 9.539) (2,4) ## (11.225, 9.539) (3,4) ## resulting recurrentPlot vector will have indexes of these pairs as its values: #----------------------------------------------------------------------------- # Calculating average S.D. of all sub-series of lengths 2, 3, ..., embedding #----------------------------------------------------------------------------- def localDiversification(series, embedding): s = array(series).copy(); t = s.copy(); t2 = s.copy()*0; locdiv = [] for i in range(2,embedding+1): # SLOW # #for j in range(len(s)-embedding+1): # t2[j] = s[j:j+embedding].std() #locdiv.append(t2[:-i].mean()) t[0:-i+1] += s[i-1:] mean = t / (array([i+0.]*len(t))) t2= (s - mean - 0.0)**2; for j in range(i-1): t2[:-(j+1)]+= (s[j+1:] - mean[:-(j+1)] - 0.0)**2 stdev = sqrt(t2 / (array([i-1.]*len(t)))) locdiv.append(stdev[:-i+1].mean()) return locdiv ## EXAMPLE: ## rawSeries = [2,4,6,3,9,1] ## embedding = 2 ## embeddedSeries = [[2,4],[4,6],[6,3],[3,9],[9,1]] ## localDiversification = average(stdev(2,4),stdev(4,6),stdev(6,3),stdev(3,9),stdev(9,1)) ## localDiversification = average(1.41,1.41,2.12,4.24,5.66) = 2.968 ## embedding = 3 ## embeddedSeries = [[2,4,6],[4,6,3],[6,3,9],[3,9,1]] ## localDiversification = average(stdev(2,4,6),stdev(4,6,3),stdev(6,3,9),stdev(3,9,1)) ## localDiversification = average(2.00,1.53,3.00,4.16) = 2.67 #----------------------------------------------------------------------------- # Shuffling the series using existing 'shuffle' class #----------------------------------------------------------------------------- def randomize(series): s = array(series).copy(); shuffle(s); return s ## EXAMPLE: ## series = [2,4,6,3,9,1] ## shuffle = [4,6,9,2,1,3] #----------------------------------------------------------------------------- # Calculating slope (diversity) and 2-bin intercept (symmetry) of the series #----------------------------------------------------------------------------- def calculateEntropy(series, '\tEntropy bin:'s = [2,4,8,16,32,64]): s = array(series).copy(); smin = s.min(); smax = s.max(); slen = len(s) srange = smax - smin; entropy = [] bins = entropyBins for i in bins: if srange <> 0: counter = histogram(s,i)[0] ent = 0.; temp = 0. for k in range(i): if counter[k] <> 0: temp = (counter[k] / (slen+0.0)) * log(counter[k] / (slen+0.0)) / log(2) ent = ent - temp entropy.append(round(ent,5)) # Calculating linear regression for the resulting entropy series in the log2 scale def regressionLine(xseries, yseries): sumxx = 0.; sumyy = 0.; sumxy = 0.; sumx = 0.; sumy = 0.; a = 0.; b = 0.; x = 0. n = len(xseries) - 0. for i in(range(len(xseries))): x = log(xseries[i]) / log(2) sumx = sumx + x sumy = sumy + yseries[i] sumxx = sumxx + x * x sumyy = sumyy + yseries[i] * yseries[i] sumxy = sumxy + x * yseries[i] Sxx = sumxx - sumx * sumx / n Syy = sumyy - sumy * sumy / n Sxy = sumxy - sumx * sumy / n b = Sxy / Sxx a = (sumy - b * sumx) / n a = a + b slope = round(b, 3) intercept = round(a, 3) return slope, intercept slope, intercept = regressionLine(bins, entropy) return entropy, intercept, slope #----------------------------------------------------------------------------- # Calculating whatever users (you) submit #----------------------------------------------------------------------------- def userFunction(series, *userArguments): pass #until you submit some function #