如何加快这个C++代码（尤其是factorial和pow部分）

我正在做这个问题，称为在Hackerrank上的算术级数。如何加快这个C++代码（尤其是factorial和pow部分）

https://www.hackerrank.com/challenges/arithmetic-progressions

我的解决方案通过了所有的前六个测试。我已尽全力优化我的代码，包括缓存，更高效的操作。我仍然无法通过测试。我认为我失败的两个地方是因子函数和pow函数。

基本上，我用unordered_map来存储所有以前的结果。如果论证是关键之一，我会马上返回结果。

这里是我的代码：

#include <cstdio> 
#include <vector> 
#include <iostream> 
#include <algorithm> 
#include <unordered_map> 
#include <utility> 
#include <ctime> 
#include <sys/time.h> 
using namespace std; 


#define mod(m) (m > 1000003) ? m % 1000003 : m 

//used for hashing the pair 
namespace std { 

template<typename a, typename b> 
struct hash< pair<a, b> > { 
private: 
    const hash<a> ah; 
    const hash<b> bh; 
public: 
    hash() : ah(), bh() {}; 
    size_t operator()(const std::pair<a, b> &p) const { 
     return ah(p.first)^bh(p.second); 
    } 
}; 
} // namespaces 

//the code below is used for collecting statistics 
/* 
typedef unsigned long long timestamp_t; 

static timestamp_t get_timestamp() 
{ 
    struct timeval now; 
    gettimeofday (&now, NULL); 
    return now.tv_usec + (timestamp_t)now.tv_sec * 1000000; 
} 
*/ 

//note that the number could get really large, that's why I use unsigned long long for most of data type 
static unsigned long long cache_D[100000]; 
static unsigned long long cache_d[100000]; 
static unsigned long long cache_p[100000]; 
static unordered_map<unsigned long long, unsigned long long> cache_F;   //use  unordered_map to store the factorial, avg insert, lookup O(1) 
static unordered_map<pair<unsigned long long, unsigned long long>, unsigned long long> cache_P; //use unordered_map to store the pow 
//static double pow_sec = 0; 
//static double fac_sec = 0; 


/** 
* Use the fast pow algorithm. On top of that, I add caching (stored in unordered map) to  speed up the pow 
* @param x base 
* @param y exponent 
* @return x^y 
*/ 
unsigned long long pow(unsigned long long x, unsigned long long y) 
{ 
    //timestamp_t t0 = get_timestamp(); 
    pair<unsigned long long, unsigned long long> curr(x, y); 
    if(cache_P.find(curr) != cache_P.end()) 
    { 
     //timestamp_t t1 = get_timestamp(); 
     //pow_sec += (t1 - t0)/1000000.0L; 
     return cache_P[curr]; 
    } 
    unsigned long long result = 1; 
    unsigned long long mod_x = mod(x); 
    //unsigned long long count = 0; 

    while(y) 
    { 
     if (y & 1) 
     { 
      unsigned long long temp = result * mod_x; 
      result = mod(temp); 
     } 
     y >>= 1; 
     unsigned long long temp = mod_x * mod_x; 
     mod_x = mod(temp); 
    } 
    cache_P[curr] = result; 
    //timestamp_t t1 = get_timestamp(); 
    //pow_sec += (t1 - t0)/1000000.0L; 
    return result; 
} 

/** 
* same idea as pow, caching whenever I can 
* @param x number to be factorialized 
* @return x! 
*/ 
unsigned long long factorial(unsigned long long x) 
{ 
    //timestamp_t t0 = get_timestamp(); 
    if (cache_F.find(x) != cache_F.end()) 
    { 
     //timestamp_t t1 = get_timestamp(); 
     //fac_sec += (t1 - t0)/1000000.0L; 
     return cache_F[x]; 
    } 
    else 
    { 
     unsigned long long result = 1; 
     //here we go from x to 1 since we could speed up operation as soon as we have x - 1 or x - 2 or x - 3 in our caching (just x * (x - 1)!) 
     for(unsigned long long i = x; i >= 1; i--) 
     { 
      if(cache_F.find(i) != cache_F.end()) 
      { 
       unsigned long long temp1 = result * cache_F[i]; 
       result = mod(temp1); 
       cache_F[x] = result; 
       //timestamp_t t1 = get_timestamp(); 
       //fac_sec += (t1 - t0)/1000000.0L; 
       return result; 
      } 
      unsigned long long mod_i = mod(i); 
      unsigned long long temp2 = result * mod_i; 
      result = mod(temp2); 
     } 
     cache_F[x] = result; 
     //timestamp_t t1 = get_timestamp(); 
     //fac_sec += (t1 - t0)/1000000.0L; 
     return result; 
    } 
} 
void query(int from, int to) 
{ 
    unsigned long long k = 0; 
    unsigned long long constant = 1; 

    for(int i = from - 1; i < to; i++) 
    { 
     k += cache_p[i]; 
     unsigned long long temp = constant * cache_D[i]; 
     constant = mod(temp); 
    } 
    unsigned long long temp = constant * factorial(k); 
    constant = mod(temp); 
    printf("%llu %llu\n", k, constant); 
} 


void update(int from, int to, int how_much) 
{ 
    for(int i = from - 1; i < to; i++) 
    { 
     cache_p[i] += how_much; 
     unsigned long long temp = cache_D[i] * pow(cache_d[i], (unsigned long long)how_much); 
     cache_D[i] = mod(temp); 
    } 
} 

int main() { 
    int num_vec, num_operations; 
    FILE *pFile = fopen("input.txt", "r"); 

    fscanf(pFile, "%d", &num_vec); 
    for(int i = 0; i < num_vec; i++) 
    { 
     unsigned long long a, d, q; 
     fscanf(pFile, "%llu %llu %llu", &a, &d, &q); 
     cache_d[i] = d; 
     cache_p[i] = q; 
     cache_D[i] = pow(d, q); 
    } 
    fscanf(pFile, "%d", &num_operations); 
    for(int i = 0; i < num_operations; i++) 
    { 
     int what_operation, from, to; 
     fscanf(pFile, "%d %d %d", &what_operation, &from, &to); 
     if(what_operation == 0) 
     { 
      query(from, to); 
     } 
     else if (what_operation == 1) 
     { 
      int add_q; 
      fscanf(pFile, "%d", &add_q); 
      update(from, to, add_q); 
     } 
    } 
    printf("sec for pow: %f\n sec for fac: %f", pow_sec, fac_sec); 
    return 0; 
}

如果有谁知道如何进一步优化我的代码，这将是很有益的。谢谢！

来源

2013-05-27 Allen Liu

你有什么资料？它在哪里比较慢？ –

那么I/O操作呢？也许你应该用'fread'而不是'fscanf'来读取文件。并在读取文件后处理数据。 –

也许这个问题应该更好地去codereview – PlasmaHH

关于析因。

“无符号长长” 从0到

18,446,744,073,709,551,615

一系列

的21阶乘是：

51,090,942,171,709,440,000

因此，这意味着你的函数只能返回正确的0到20的阶乘结果。

以预构建的c开始会容易很多疼痛与21个元素（零到20）。

unsigned long long fact_cache [21] = { 

1      , 
1      , 
2      , 
6      , 
24      , 
120     , 
720     , 
5040     , 
40320     , 
362880     , 
3628800    , 
39916800    , 
479001600    , 
6227020800    , 
87178291200   , 
1307674368000   , 
20922789888000   , 
355687428096000  , 
6402373705728000  , 
121645100408832000  , 
2432902008176640000      

};

现在，您的因子函数只需在数组中查找正确的值（如果喜欢就检查边界）。

编辑：（实现了OP意“阶乘MOD 1000003“）

我开始阅读答案：基于第二个答案

Fast way to calculate n! mod m where m is prime?

，与python代码示例，以及更多信息：

http://www.algorithmist.com/index.php/Modular_inverse

他给了这个Python代码，以及一个可能的改进。

def factorialMod(n, modulus): 
    ans=1 
    if n <= modulus//2: 
     #calculate the factorial normally (right argument of range() is exclusive) 
     for i in range(1,n+1): 
      ans = (ans * i) % modulus 
    else: 
     #Fancypants method for large n 
     for i in range(n+1,modulus): 
      ans = (ans * i) % modulus 
     ans = modinv(ans, modulus) 
     ans = -1*ans + modulus 
    return ans % modulus

这有一定的优势，在当n大于模数除以2时，因为我们可以通过求解模逆减少乘法的数目的原始方法。因为我们知道模数（1000003），我们可以通过使用它的总数（1000002）来求解modinv。

在伪代码中，我们得到的东西，如：

long factorial_mod(long n) { 
    if(n > 1000003) 
     return 0; //from Thomas's comment 

    if(cache[n] != 0) //if the cache has the answer 
     return cache[n]; 

    long result = 1; 
    if (n <= 500001) 
    { 
     for(long i = 1; i <= n; i++) 
     { 
      result = (result * i) % 1000003; 
      cache[i] = result; 
     } 
    } 
    else 
    { 
     for(long i = n+1; i <= 1000003; i++) 
     { 
      result = (result * i) % 1000003; 
     } 
     result = modinv(result, 1000003); 
     result = -1*result + 1000003; 
    } 

    result = result % 1000003; 
    cache[n] = result; 
    return result; 
} 

long modinv(long a, int modulus) { 
    return modPow(a, 1000002, modulus); // ((a to the totient) mod the modulus) 
}

如果我们不想计算欧拉，我们可以使用扩展欧拉GCD来解决模逆。（当然素数的总数很容易计算......只需要减去一个）。

来源

2013-05-27 22:04:16 Xantix

问题是，因为数字可能会变得非常大，所以问题指出我们可以使用mod 1000003.因此技术上x可能大于21，但其结果仍然在unsigned long long的范围内。 –

好点。在关于阶乘函数的描述/评论中，没有提到“mod”，所以我一定错过了这个。 – Xantix

@AllenLiu你允许多少内存？存储从0到1000002的所有阶乘余数（以后的所有内容都是零）应该适合大约4兆字节的内存。您可以快速预先计算它们，而不是不断重新计算所有内容，并缓存高速缓存。 – Thomas

如何加快这个C++代码（尤其是factorial和pow部分）

回答

相关问题