Mangled file names in temp dir
Pavel Sanda
sanda at lyx.org
Wed Nov 8 09:44:16 UTC 2023
On Thu, Oct 26, 2023 at 01:23:35PM +0200, Pavel Sanda wrote:
> Possible collision are easily handled by recursive hash(hash(..(hash(string))),
> see attached patch.
Polished patch attached. I will do some more testing before committing to the tree.
Pavel
-------------- next part --------------
diff --git a/src/insets/InsetGraphics.cpp b/src/insets/InsetGraphics.cpp
index dac61ee449..62577f07db 100644
--- a/src/insets/InsetGraphics.cpp
+++ b/src/insets/InsetGraphics.cpp
@@ -575,14 +575,14 @@ copyFileIfNeeded(FileName const & file_in, FileName const & file_out)
pair<GraphicsCopyStatus, FileName> const
-copyToDirIfNeeded(DocFileName const & file, string const & dir)
+copyToDirIfNeeded(DocFileName const & file, string const & dir, bool encrypt_path)
{
string const file_in = file.absFileName();
string const only_path = onlyPath(file_in);
if (rtrim(only_path, "/") == rtrim(dir, "/"))
return make_pair(IDENTICAL_PATHS, FileName(file_in));
- string mangled = file.mangledFileName(empty_string(), false, true);
+ string mangled = file.mangledFileName(empty_string(), encrypt_path);
if (theFormats().isZippedFile(file)) {
// We need to change _eps.gz to .eps.gz. The mangled name is
// still unique because of the counter in mangledFileName().
@@ -680,7 +680,7 @@ string InsetGraphics::prepareFile(OutputParams const & runparams) const
// we move it to a temp dir or uncompress it.
FileName temp_file;
GraphicsCopyStatus status;
- tie(status, temp_file) = copyToDirIfNeeded(params().filename, temp_path);
+ tie(status, temp_file) = copyToDirIfNeeded(params().filename, temp_path, false);
if (status == FAILURE)
return orig_file;
@@ -997,7 +997,7 @@ string InsetGraphics::prepareHTMLFile(OutputParams const & runparams) const
// Copy to temporary directory.
FileName temp_file;
GraphicsCopyStatus status;
- tie(status, temp_file) = copyToDirIfNeeded(params().filename, temp_path);
+ tie(status, temp_file) = copyToDirIfNeeded(params().filename, temp_path, true);
if (status == FAILURE)
return string();
diff --git a/src/support/FileName.cpp b/src/support/FileName.cpp
index 32a5862fdb..7fbd2f4b4c 100644
--- a/src/support/FileName.cpp
+++ b/src/support/FileName.cpp
@@ -957,69 +957,102 @@ string DocFileName::outputFileName(string const & path) const
return save_abs_path_ ? absFileName() : relFileName(path);
}
-
-string DocFileName::mangledFileName(string const & dir) const
+// check for an existing value in a map
+bool collision_exists(map<string, string> const & db, string const & value)
{
- return mangledFileName(dir, true, false);
+ for (auto it = db.begin(); it != db.end(); ++it)
+ if (it->second == value)
+ return true;
+ return false;
+
+/* replace the above, once we support C++17
+ for (const auto& [key, val] : db)
+ if (val == value)
+ return true;
+ return false; */
}
-string DocFileName::mangledFileName(string const & dir, bool use_counter, bool encrypt_path) const
+string DocFileName::mangledFileName(string const & dir, bool encrypt_path) const
{
// Concurrent access to these variables is possible.
// We need to make sure that every DocFileName instance for a given
// filename returns the same mangled name.
typedef map<string, string> MangledMap;
- static MangledMap mangledNames;
+ static MangledMap mangledPlainNames;
+ static MangledMap mangledEncryptNames; //separate table for xhtml() route
static Mutex mangledMutex;
// this locks both access to mangledNames and counter below
Mutex::Locker lock(&mangledMutex);
- MangledMap::const_iterator const it = mangledNames.find(absFileName());
- if (it != mangledNames.end())
+ MangledMap * mangledNames = encrypt_path ? &mangledEncryptNames : &mangledPlainNames;
+ MangledMap::const_iterator const it = mangledNames->find(absFileName());
+ if (it != mangledNames->end())
return (*it).second;
string const name = absFileName();
- // Now the real work. Remove the extension.
- string mname = support::changeExtension(name, string());
+ string mname;
- if (encrypt_path)
- mname = "export_" + onlyFileName() + "_" + toHexHash(mname);
+ // xHTML route
+ // we use hash instead of counter to get stable filenames in export directory
+ if (encrypt_path) {
+ // sanitization probably not neccessary for xhtml, but won't harm
+ string sanfn = support::changeExtension(onlyFileName(), string());
+ sanfn = sanitizeFileName(sanfn);
+ // Add the extension back on
+ sanfn = support::changeExtension(sanfn, getExtension(onlyFileName()));
- // The mangled name must be a valid LaTeX name.
- mname = sanitizeFileName(mname);
- // Add the extension back on
- mname = support::changeExtension(mname, getExtension(name));
+ //various filesystems have filename limit around 2^8
+ if (sanfn.length() > 230)
+ sanfn = "";
- // Prepend a counter to the filename. This is necessary to make
- // the mangled name unique.
- static int counter = 0;
+ string enc_name = "e_" + toHexHash(name, true) + "_" + sanfn;
+
+ while (collision_exists(mangledEncryptNames, enc_name))
+ enc_name = "e_" + toHexHash(enc_name, true) + "_" + sanfn;
+
+ mname = enc_name;
+ }
+
+ // LaTeX route
+ if (!encrypt_path) {
+ // Now the real work. Remove the extension.
+ mname = support::changeExtension(name, string());
+ // The mangled name must be a valid LaTeX name.
+ mname = sanitizeFileName(mname);
+ // Add the extension back on
+ mname = support::changeExtension(mname, getExtension(name));
+
+ // Prepend a counter to the filename. This is necessary to make
+ // the mangled name unique, see truncation below.
+ //
+ static int counter = 0;
- if (use_counter) {
ostringstream s;
s << counter++ << mname;
mname = s.str();
- }
- // MiKTeX's YAP (version 2.4.1803) crashes if the file name
- // is longer than about 160 characters. MiKTeX's pdflatex
- // is even pickier. A maximum length of 100 has been proven to work.
- // If dir.size() > max length, all bets are off for YAP. We truncate
- // the filename nevertheless, keeping a minimum of 10 chars.
- string::size_type max_length = max(100 - ((int)dir.size() + 1), 10);
+ // MiKTeX's YAP (version 2.4.1803) crashes if the file name
+ // is longer than about 160 characters. MiKTeX's pdflatex
+ // is even pickier. A maximum length of 100 has been proven to work.
+ // If dir.size() > max length, all bets are off for YAP. We truncate
+ // the filename nevertheless, keeping a minimum of 10 chars.
- // If the mangled file name is too long, hack it to fit.
- // We know we're guaranteed to have a unique file name because
- // of the counter.
- if (mname.size() > max_length) {
- int const half = (int(max_length) / 2) - 2;
- if (half > 0) {
- mname = mname.substr(0, half) + "___" +
- mname.substr(mname.size() - half);
+ string::size_type max_length = max(100 - ((int)dir.size() + 1), 10);
+
+ // If the mangled file name is too long, hack it to fit.
+ // We know we're guaranteed to have a unique file name because
+ // of the counter.
+ if (mname.size() > max_length) {
+ int const half = (int(max_length) / 2) - 2;
+ if (half > 0) {
+ mname = mname.substr(0, half) + "___" +
+ mname.substr(mname.size() - half);
+ }
}
}
- mangledNames[absFileName()] = mname;
+ (*mangledNames)[absFileName()] = mname;
return mname;
}
diff --git a/src/support/FileName.h b/src/support/FileName.h
index 95940a9235..35f182e428 100644
--- a/src/support/FileName.h
+++ b/src/support/FileName.h
@@ -288,19 +288,30 @@ public:
* - two FileName instances with the same filename have identical
* mangled names.
*
+ * @param encrypt_path will use short hash instead of path
+ * in the mangled name if set to true. Useful for xHTML export
+ * so we do not leak e.g. user names contained in the paths.
+ * Filename itself is stripped if the resulting filename would
+ * become too long (~250 chars).
+ * Prefix counter is not used because
+ * 1) it's hack useful for MikTeX/YAP only
+ * 2) causes many duplicates within the export directory across different
+ * LyX sessions as (unlike in LaTeX export) we use mangled names in
+ * final xHTML export directory.
+ * An example of hashed mangled case:
+ * C:/foo bar/baz.png - > e_95a42ec852ea_baz.png
+ *
+ * It is guaranteed that
+ * - two different filenames have different mangled names (modulo hash collision)
+ * - two FileName instances with the same filename have identical hashed
+ * mangled names.
+ *
+ *
* Only the mangled file name is returned. It is not prepended
* with @c dir.
*/
std::string
- mangledFileName(std::string const & dir = empty_string()) const;
-
- /** Identical to mangledFileName, with the following additions:
- *
- * @encrypt_path allows using hash (SHA-256) instead of full path.
- * @use_counter allows disabling the counter in the filename.
- */
- std::string
- mangledFileName(std::string const & dir, bool use_counter, bool encrypt_path) const;
+ mangledFileName(std::string const & dir = empty_string(), bool encrypt_path=false) const;
/// \return the absolute file name without its .gz, .z, .Z extension
std::string unzippedFileName() const;
diff --git a/src/support/filetools.cpp b/src/support/filetools.cpp
index e8f3aa19ce..a2eba83eef 100644
--- a/src/support/filetools.cpp
+++ b/src/support/filetools.cpp
@@ -1334,13 +1334,22 @@ void fileUnlock(int fd, const char * /* lock_file*/)
}
-std::string toHexHash(const std::string & str)
+std::string toHexHash(const std::string & str, bool shorten)
{
- // Use the best available hashing algorithm.
auto hashAlgo = QCryptographicHash::Sha256;
QByteArray hash = QCryptographicHash::hash(toqstr(str).toLocal8Bit(), hashAlgo);
- return fromqstr(QString(hash.toHex()));
+ QString qshash=QString(hash.toHex());
+
+ /* For shortened case we take 12 leftmost chars (6 bytes encoded).
+ * Random experiment shows:
+ * 8 chars: 16 collisions for 10^5 graphic filenames
+ * 12 chars: 0 collisions for 10^5 graphic filenames
+ */
+ if (shorten)
+ qshash=qshash.left(12);
+
+ return fromqstr(qshash);
}
diff --git a/src/support/filetools.h b/src/support/filetools.h
index 205e7ff589..ceb1da62be 100644
--- a/src/support/filetools.h
+++ b/src/support/filetools.h
@@ -348,14 +348,16 @@ int fileLock(const char * lock_file);
void fileUnlock(int fd, const char * lock_file);
/** Return the hex-encoded cryptographic hash of a string.
- * The hash algorithm is not fixed, but it is determined at compile time.
* This function is typically used to create relatively stable file names,
* because cryptographic hash functions ensure that very small changes in the
* input result in large changes in the output.
+ * Due to inherent limits of path length in Windows we allow very short version
+ * (effectively leftmost 6 bytes encoded via 12 chars in hex) while increasing the
+ * probability of collision (via shorten=true).
* There is no limit in the length of the input string: it can be a file name
* or the contents of a file, for instance.
*/
-std::string toHexHash(const std::string & str);
+std::string toHexHash(const std::string & str, bool shorten=false);
/// Replace non-ASCII characters to ensure that the string can be used as a
/// file name on all platforms and as a LaTeX name.
More information about the lyx-devel
mailing list